/*	$NetBSD: rf_netbsdkintf.c,v 1.410.4.3 2023/10/18 11:44:22 martin Exp $	*/
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.410.4.3 2023/10/18 11:44:22 martin Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_raid_autoconfig.h"
108 #include "opt_compat_netbsd32.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/compat_stub.h>
131
132 #include <prop/proplib.h>
133
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137
138 #include "rf_raid.h"
139 #include "rf_copyback.h"
140 #include "rf_dag.h"
141 #include "rf_dagflags.h"
142 #include "rf_desc.h"
143 #include "rf_diskqueue.h"
144 #include "rf_etimer.h"
145 #include "rf_general.h"
146 #include "rf_kintf.h"
147 #include "rf_options.h"
148 #include "rf_driver.h"
149 #include "rf_parityscan.h"
150 #include "rf_threadstuff.h"
151
152 #include "ioconf.h"
153
154 #ifdef DEBUG
155 int rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else /* DEBUG */
158 #define db1_printf(a) { }
159 #endif /* DEBUG */
160
/*
 * NULL-safe device_xname(): yields "null" for a NULL device_t so the
 * result can be used directly in format strings.  Argument and
 * expansion are parenthesized so the macro parses correctly inside
 * larger expressions (e.g. as an operand of ?: or ==).
 */
#define DEVICE_XNAME(dev) ((dev) ? device_xname(dev) : "null")
162
163 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
164 static rf_declare_mutex2(rf_sparet_wait_mutex);
165 static rf_declare_cond2(rf_sparet_wait_cv);
166 static rf_declare_cond2(rf_sparet_resp_cv);
167
168 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
169 * spare table */
170 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
171 * installation process */
172 #endif
173
174 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
175
176 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
177
178 /* prototypes */
179 static void KernelWakeupFunc(struct buf *);
180 static void InitBP(struct buf *, struct vnode *, unsigned,
181 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
182 void *, int);
183 static void raidinit(struct raid_softc *);
184 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
185 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
186
187 static int raid_match(device_t, cfdata_t, void *);
188 static void raid_attach(device_t, device_t, void *);
189 static int raid_detach(device_t, int);
190
191 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
192 daddr_t, daddr_t);
193 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
194 daddr_t, daddr_t, int);
195
196 static int raidwrite_component_label(unsigned,
197 dev_t, struct vnode *, RF_ComponentLabel_t *);
198 static int raidread_component_label(unsigned,
199 dev_t, struct vnode *, RF_ComponentLabel_t *);
200
201 static int raid_diskstart(device_t, struct buf *bp);
202 static int raid_dumpblocks(device_t, void *, daddr_t, int);
203 static int raid_lastclose(device_t);
204
205 static dev_type_open(raidopen);
206 static dev_type_close(raidclose);
207 static dev_type_read(raidread);
208 static dev_type_write(raidwrite);
209 static dev_type_ioctl(raidioctl);
210 static dev_type_strategy(raidstrategy);
211 static dev_type_dump(raiddump);
212 static dev_type_size(raidsize);
213
/* Block-device entry points: raid behaves like a regular disk. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
224
/* Character-device (raw) entry points; reads/writes go through physio. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
239
/* dk(4) framework glue: callbacks used by the common disk driver code. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
249
250 #define raidunit(x) DISKUNIT(x)
251 #define raidsoftc(dev) (((struct raid_softc *)device_private(dev))->sc_r.softc)
252
253 extern struct cfdriver raid_cd;
254 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
255 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
256 DVF_DETACH_SHUTDOWN);
257
/* Internal representation of a rf_recon_req: the userland request plus
 * the raid it applies to, so it can be handed to a kernel thread. */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* component column the request targets */
	RF_ReconReqFlags_t flags;	/* reconstruction request flags */
	void *raidPtr;			/* RF_Raid_t * this request is for */
};
264
265 /*
266 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
267 * Be aware that large numbers can allow the driver to consume a lot of
268 * kernel memory, especially on writes, and in degraded mode reads.
269 *
270 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
271 * a single 64K write will typically require 64K for the old data,
272 * 64K for the old parity, and 64K for the new parity, for a total
273 * of 192K (if the parity buffer is not re-used immediately).
274 * Even it if is used immediately, that's still 128K, which when multiplied
275 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
276 *
277 * Now in degraded mode, for example, a 64K read on the above setup may
278 * require data reconstruction, which will require *all* of the 4 remaining
279 * disks to participate -- 4 * 32K/disk == 128K again.
280 */
281
282 #ifndef RAIDOUTSTANDING
283 #define RAIDOUTSTANDING 6
284 #endif
285
286 #define RAIDLABELDEV(dev) \
287 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
288
289 /* declared here, and made public, for the benefit of KVM stuff.. */
290
291 static int raidlock(struct raid_softc *);
292 static void raidunlock(struct raid_softc *);
293
294 static int raid_detach_unlocked(struct raid_softc *);
295
296 static void rf_markalldirty(RF_Raid_t *);
297 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
298
299 static void rf_ReconThread(struct rf_recon_req_internal *);
300 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
301 static void rf_CopybackThread(RF_Raid_t *raidPtr);
302 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
303 static int rf_autoconfig(device_t);
304 static int rf_rescan(void);
305 static void rf_buildroothack(RF_ConfigSet_t *);
306
307 static RF_AutoConfig_t *rf_find_raid_components(void);
308 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
309 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
310 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
311 static int rf_set_autoconfig(RF_Raid_t *, int);
312 static int rf_set_rootpartition(RF_Raid_t *, int);
313 static void rf_release_all_vps(RF_ConfigSet_t *);
314 static void rf_cleanup_config_set(RF_ConfigSet_t *);
315 static int rf_have_enough_components(RF_ConfigSet_t *);
316 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
317 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
318
319 /*
320 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
321 * Note that this is overridden by having RAID_AUTOCONFIG as an option
322 * in the kernel config file.
323 */
324 #ifdef RAID_AUTOCONFIG
325 int raidautoconfig = 1;
326 #else
327 int raidautoconfig = 0;
328 #endif
329 static bool raidautoconfigdone = false;
330
331 struct pool rf_alloclist_pool; /* AllocList */
332
333 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
334 static kmutex_t raid_lock;
335
336 static struct raid_softc *
337 raidcreate(int unit) {
338 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
339 sc->sc_unit = unit;
340 cv_init(&sc->sc_cv, "raidunit");
341 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
342 return sc;
343 }
344
345 static void
346 raiddestroy(struct raid_softc *sc) {
347 cv_destroy(&sc->sc_cv);
348 mutex_destroy(&sc->sc_mutex);
349 kmem_free(sc, sizeof(*sc));
350 }
351
352 static struct raid_softc *
353 raidget(int unit, bool create) {
354 struct raid_softc *sc;
355 if (unit < 0) {
356 #ifdef DIAGNOSTIC
357 panic("%s: unit %d!", __func__, unit);
358 #endif
359 return NULL;
360 }
361 mutex_enter(&raid_lock);
362 LIST_FOREACH(sc, &raids, sc_link) {
363 if (sc->sc_unit == unit) {
364 mutex_exit(&raid_lock);
365 return sc;
366 }
367 }
368 mutex_exit(&raid_lock);
369 if (!create)
370 return NULL;
371 sc = raidcreate(unit);
372 mutex_enter(&raid_lock);
373 LIST_INSERT_HEAD(&raids, sc, sc_link);
374 mutex_exit(&raid_lock);
375 return sc;
376 }
377
/*
 * Unlink a softc from the global raids list and free it.  The caller
 * must guarantee no other references remain.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
385
/*
 * Legacy pseudo-device attach hook, kept for the config(9) glue; it is
 * intentionally empty.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
395
/*
 * One-shot boot-time autoconfiguration: find all RAID components on
 * the system, sort them into sets, and configure the eligible ones
 * via rf_buildroothack().  Returns 1 if a scan was performed, 0 when
 * autoconfiguration is disabled or has already run.
 */
static int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return 0;

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
433
/*
 * Return non-zero if the unit has been configured (RAIDF_INITED set).
 */
int
rf_inited(const struct raid_softc *rs) {
	return (rs->sc_flags & RAIDF_INITED) != 0;
}
438
/*
 * Accessor: pointer to the RF_Raid_t embedded in the softc.
 */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
443
/*
 * Accessor: unit number of the softc.
 */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
448
/*
 * Return 1 if RAID set r contains device bdv (typically the boot
 * device), 0 otherwise.  Wedge ("dk") components are compared via
 * their parent disk's name.
 *
 * NOTE(review): the comparison is strncmp() with len = strlen of the
 * boot device name, i.e. a prefix match — a boot device "wd1" would
 * also match a component on "wd10".
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* skip the "/dev/" prefix stored in the component label */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* wedge component: compare the parent disk instead */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
479
/*
 * Re-scan the system for RAID components and autoconfigure any newly
 * complete sets.  The outer loop repeats until a pass adds no new set,
 * so RAID sets stacked on other RAID sets are also picked up.  All
 * config-set resources are released before returning.  Always returns 0.
 */
static int
rf_rescan(void)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets, *cset, *next_cset;
	struct raid_softc *sc;
	int raid_added;

	/* collect components and group them into candidate sets */
	ac_list = rf_find_raid_components();
	config_sets = rf_create_auto_sets(ac_list);

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
					    sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	return 0;
}
528
/*
 * Example setup:
 * dk1 at wd0: "raid@wd0", 171965 blocks at 32802, type: raidframe
 * dk3 at wd1: "raid@wd1", 171965 blocks at 32802, type: raidframe
 * raid1: Components: /dev/dk1 /dev/dk3
 * dk4 at raid1: "empty@raid1", 8192 blocks at 34, type: msdos
 * dk5 at raid1: "root@raid1", 163517 blocks at 8226, type: ffs
 *
 * If booted from wd0, booted_device will be
 * disk wd0, startblk = 41092, nblks = 163517
 *
 * That is, dk5 with startblk computed from the beginning of wd0
 * instead of beginning of raid1:
 * 32802 + 64 (RF_PROTECTED_SECTORS) + 8226 = 41092
 *
 * In order to find the boot wedge, we must iterate on each component,
 * find its offset from disk beginning, and look for the boot wedge with
 * startblk adjusted.
 */
548 static device_t
549 rf_find_bootwedge(struct raid_softc *rsc)
550 {
551 RF_Raid_t *r = &rsc->sc_r;
552 const char *bootname;
553 size_t len;
554 device_t rdev = NULL;
555
556 if (booted_device == NULL)
557 goto out;
558
559 bootname = device_xname(booted_device);
560 len = strlen(bootname);
561
562 aprint_debug("%s: booted_device %s, startblk = %"PRId64", "
563 "nblks = %"PRId64"\n", __func__,
564 bootname, booted_startblk, booted_nblks);
565
566 for (int col = 0; col < r->numCol; col++) {
567 const char *devname = r->Disks[col].devname;
568 const char *parent;
569 struct disk *dk;
570 u_int nwedges;
571 struct dkwedge_info *dkwi;
572 struct dkwedge_list dkwl;
573 size_t dkwi_len;
574 int i;
575
576 devname += sizeof("/dev/") - 1;
577 if (strncmp(devname, "dk", 2) != 0)
578 continue;
579
580 parent = dkwedge_get_parent_name(r->Disks[col].dev);
581 if (parent == NULL) {
582 aprint_debug("%s: cannot find parent for "
583 "component /dev/%s", __func__, devname);
584 continue;
585 }
586
587 if (strncmp(parent, bootname, len) != 0)
588 continue;
589
590 aprint_debug("%s: looking up wedge %s in device %s\n",
591 __func__, devname, parent);
592
593 dk = disk_find(parent);
594 nwedges = dk->dk_nwedges;
595 dkwi_len = sizeof(*dkwi) * nwedges;
596 dkwi = RF_Malloc(dkwi_len);
597
598 dkwl.dkwl_buf = dkwi;
599 dkwl.dkwl_bufsize = dkwi_len;
600 dkwl.dkwl_nwedges = 0;
601 dkwl.dkwl_ncopied = 0;
602
603 if (dkwedge_list(dk, &dkwl, curlwp) == 0) {
604 daddr_t startblk;
605
606 for (i = 0; i < dkwl.dkwl_ncopied; i++) {
607 if (strcmp(dkwi[i].dkw_devname, devname) == 0)
608 break;
609 }
610
611 KASSERT(i < dkwl.dkwl_ncopied);
612
613 aprint_debug("%s: wedge %s, "
614 "startblk = %"PRId64", "
615 "nblks = %"PRId64"\n",
616 __func__,
617 dkwi[i].dkw_devname,
618 dkwi[i].dkw_offset,
619 dkwi[i].dkw_size);
620
621 startblk = booted_startblk
622 - dkwi[i].dkw_offset
623 - RF_PROTECTED_SECTORS;
624
625 aprint_debug("%s: looking for wedge in %s, "
626 "startblk = %"PRId64", "
627 "nblks = %"PRId64"\n",
628 __func__,
629 DEVICE_XNAME(rsc->sc_dksc.sc_dev),
630 startblk, booted_nblks);
631
632 rdev = dkwedge_find_partition(rsc->sc_dksc.sc_dev,
633 startblk,
634 booted_nblks);
635 if (rdev) {
636 aprint_debug("%s: root candidate wedge %s "
637 "shifted from %s\n", __func__,
638 device_xname(rdev),
639 dkwi[i].dkw_devname);
640 goto done;
641 } else {
642 aprint_debug("%s: not found\n", __func__);
643 }
644 }
645
646 aprint_debug("%s: nothing found for col %d\n", __func__, col);
647 done:
648 RF_Free(dkwi, dkwi_len);
649 }
650
651 out:
652 if (!rdev)
653 aprint_debug("%s: nothing found\n", __func__);
654
655 return rdev;
656 }
657
/*
 * Configure all eligible autoconfigured RAID sets, then — unless the
 * user hardwired a root device via rootspec — try to point
 * booted_device at the RAID set (or a wedge on it) that contains the
 * boot components.  The configuration loop repeats until a pass adds
 * no new set, so RAID on top of RAID is handled.  The config_sets list
 * and its resources are consumed.
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* number of rootable sets configured */
	int raid_added;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
					    sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */
	if (num_root == 1) {
		device_t candidate_root = NULL;
		dksc = &rsc->sc_dksc;

		if (dksc->sc_dkdev.dk_nwedges != 0) {

			/* Find the wedge we booted from */
			candidate_root = rf_find_bootwedge(rsc);

			/* Try first partition */
			if (candidate_root == NULL) {
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			aprint_debug("%s: candidate wedge root %s\n",
			    __func__, DEVICE_XNAME(candidate_root));
		} else {
			/* no wedges: the raid device itself is the candidate */
			candidate_root = dksc->sc_dev;
		}

		aprint_debug("%s: candidate root = %s, booted_device = %s, "
		    "root_partition = %d, contains_boot=%d\n",
		    __func__, DEVICE_XNAME(candidate_root),
		    DEVICE_XNAME(booted_device), rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));

		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			aprint_debug("%s: set booted_device = %s\n", __func__,
			    DEVICE_XNAME(booted_device));
		}
	} else if (num_root > 1) {
		aprint_debug("%s: many roots=%d, %s\n", __func__, num_root,
		    DEVICE_XNAME(booted_device));

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* several rootable sets: pick the one holding booted_device */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
800
801 static int
802 raidsize(dev_t dev)
803 {
804 struct raid_softc *rs;
805 struct dk_softc *dksc;
806 unsigned int unit;
807
808 unit = raidunit(dev);
809 if ((rs = raidget(unit, false)) == NULL)
810 return -1;
811 dksc = &rs->sc_dksc;
812
813 if ((rs->sc_flags & RAIDF_INITED) == 0)
814 return -1;
815
816 return dk_size(dksc, dev);
817 }
818
/*
 * bdevsw d_dump entry: translate the partition-relative block number
 * and hand the crash dump off to the dk(4) framework.
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

	/*
	   Note that blkno is relative to this particular partition.
	   By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
	 */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}
843
/*
 * dk(4) dumpblocks callback: write nblk sectors at blkno directly to a
 * single live component (or used spare) of the set.  Dumping is only
 * supported for RAID 1 layouts (1 data column, 1 parity column).
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
949
950 /* ARGSUSED */
/*
 * Open the raid device.  The softc is created on first open so that an
 * unconfigured unit can later be configured via ioctl; opens during
 * shutdown are refused; the first open of a configured, idle set marks
 * all components dirty before passing the open to dk(4).
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int error = 0;
	int part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return error;

	/* no new opens while the unit is being torn down */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	/* only pass the open down once the set is configured */
	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return error;
}
999
/*
 * dk(4) lastclose callback, invoked when the final partition reference
 * is dropped.  The device stays configured; component labels get their
 * final "clean" update, and a pending shutdown is promoted to a detach
 * request that raidclose() acts on.
 */
static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/* Last one... device is not unconfigured yet.
	   Device shutdown has taken care of setting the
	   clean bits if RAIDF_INITED is not set
	   mark things as clean... */

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}
1019
1020 /* ARGSUSED */
/*
 * Close the raid device.  For a configured unit the close is passed to
 * dk(4); if a detach was requested (RAIDF_DETACH, set by
 * raid_lastclose()) the pseudo-device is destroyed after unlocking.
 * An unconfigured unit in shutdown has its softc released here.
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	/* the actual detach/free happens outside the unit lock */
	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return error;
}
1059
/*
 * Wake the RAIDframe engine: signal iodone_cv under its lock so queued
 * work gets picked up.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
1067
1068 static void
1069 raidstrategy(struct buf *bp)
1070 {
1071 unsigned int unit;
1072 struct raid_softc *rs;
1073 struct dk_softc *dksc;
1074 RF_Raid_t *raidPtr;
1075
1076 unit = raidunit(bp->b_dev);
1077 if ((rs = raidget(unit, false)) == NULL) {
1078 bp->b_error = ENXIO;
1079 goto fail;
1080 }
1081 if ((rs->sc_flags & RAIDF_INITED) == 0) {
1082 bp->b_error = ENXIO;
1083 goto fail;
1084 }
1085 dksc = &rs->sc_dksc;
1086 raidPtr = &rs->sc_r;
1087
1088 /* Queue IO only */
1089 if (dk_strategy_defer(dksc, bp))
1090 goto done;
1091
1092 /* schedule the IO to happen at the next convenient time */
1093 raid_wakeup(raidPtr);
1094
1095 done:
1096 return;
1097
1098 fail:
1099 bp->b_resid = bp->b_bcount;
1100 biodone(bp);
1101 }
1102
/*
 * dk(9) diskstart callback: submit one buffer to the RAID set.
 * Returns ENODEV if the set is no longer valid, otherwise whatever
 * raiddoaccess() returns (EAGAIN when no openings are available).
 */
static int
raid_diskstart(device_t dev, struct buf *bp)
{
	struct raid_softc *rs = raidsoftc(dev);
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		db1_printf(("raid is not valid..\n"));
		return ENODEV;
	}

	/* XXX */
	bp->b_resid = 0;

	return raiddoaccess(raidPtr, bp);
}
1120
/*
 * I/O completion: hand the finished buffer back to dk(9), return the
 * I/O slot ("opening") to the set, and wake the RAID thread so any
 * queued requests can be started.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
1139
1140 /* ARGSUSED */
1141 static int
1142 raidread(dev_t dev, struct uio *uio, int flags)
1143 {
1144 int unit = raidunit(dev);
1145 struct raid_softc *rs;
1146
1147 if ((rs = raidget(unit, false)) == NULL)
1148 return ENXIO;
1149
1150 if ((rs->sc_flags & RAIDF_INITED) == 0)
1151 return ENXIO;
1152
1153 return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
1154
1155 }
1156
1157 /* ARGSUSED */
1158 static int
1159 raidwrite(dev_t dev, struct uio *uio, int flags)
1160 {
1161 int unit = raidunit(dev);
1162 struct raid_softc *rs;
1163
1164 if ((rs = raidget(unit, false)) == NULL)
1165 return ENXIO;
1166
1167 if ((rs->sc_flags & RAIDF_INITED) == 0)
1168 return ENXIO;
1169
1170 return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
1171
1172 }
1173
/*
 * Tear down a RAID set prior to device detach.  Caller holds the
 * softc lock (hence "unlocked" refers to the queue/kernel locks,
 * not the softc lock).
 *
 * Refuses (EBUSY) while any partition is open or a reconstruction,
 * parity rewrite, or copyback is in progress.  Returns 0 immediately
 * if the set was never configured.  The teardown order below
 * (rf_Shutdown, drain/free the buffer queue, then detach the disk
 * and dk structures) is significant.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
1211
/*
 * Administratively fail a component (RAIDFRAME_FAIL_DISK).
 *
 * Validates the request, then - under raidPtr->mutex - rejects the
 * operation if a reconstruction is running, if another component has
 * already failed, or if the target is a spare.  On success a copy of
 * the request is handed to a newly created reconstruction thread.
 * Returns EINVAL on rejection, ENOMEM on allocation failure, or the
 * thread-creation result.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* rejection path: drop the mutex taken above */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
1260
/*
 * Copy the layout-specific configuration data in from userland.
 *
 * On success, k_cfg->layoutSpecific is REPLACED with a pointer to a
 * kernel copy of the data (the caller is responsible for freeing it,
 * sized by k_cfg->layoutSpecificSize).  A size of 0 means there is
 * nothing to copy; sizes above 10000 bytes are rejected as a sanity
 * check.
 */
static int
rf_copyinspecificbuf(RF_Config_t *k_cfg)
{
	/* allocate a buffer for the layout-specific data, and copy it in */
	if (k_cfg->layoutSpecificSize == 0)
		return 0;

	if (k_cfg->layoutSpecificSize > 10000) {
		/* sanity check */
		return EINVAL;
	}

	u_char *specific_buf;
	specific_buf = RF_Malloc(k_cfg->layoutSpecificSize);
	if (specific_buf == NULL)
		return ENOMEM;

	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
	    k_cfg->layoutSpecificSize);
	if (retcode) {
		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
		return retcode;
	}

	/* swap the user pointer for the kernel copy */
	k_cfg->layoutSpecific = specific_buf;
	return 0;
}
1289
/*
 * Copy a RF_Config_t in from userland ('data' holds a user pointer to
 * the structure).  On success *k_cfg points at a kernel copy owned by
 * the caller.  Fails with EINVAL if the unit is already configured;
 * on a copyin failure the set is additionally marked for shutdown so
 * the half-configured unit is cleaned up on close.
 */
static int
rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
{
	RF_Config_t *u_cfg = *((RF_Config_t **) data);

	if (rs->sc_r.valid) {
		/* There is a valid RAID set running on this unit! */
		printf("raid%d: Device already configured!\n", rs->sc_unit);
		return EINVAL;
	}

	/* copy-in the configuration information */
	/* data points to a pointer to the configuration structure */
	*k_cfg = RF_Malloc(sizeof(**k_cfg));
	if (*k_cfg == NULL) {
		return ENOMEM;
	}
	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
	if (retcode == 0)
		return 0;
	RF_Free(*k_cfg, sizeof(RF_Config_t));
	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
	rs->sc_flags |= RAIDF_SHUTDOWN;
	return retcode;
}
1315
/*
 * Configure a RAID set from a kernel copy of the configuration
 * (always consumes k_cfg, freeing it before return).
 *
 * Sanitizes all user-supplied strings (forced nul-termination) and
 * bounds-checks column/spare counts before calling rf_Configure().
 * On success the dk/disk structures are initialized via raidinit()
 * and all components are marked dirty; on any failure the set is
 * flagged RAIDF_SHUTDOWN so it is detached on close.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode, i;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* Force nul-termination on all strings. */
#define ZERO_FINAL(s) do { s[sizeof(s) - 1] = '\0'; } while (0)
	for (i = 0; i < RF_MAXCOL; i++) {
		ZERO_FINAL(k_cfg->devnames[0][i]);
	}
	for (i = 0; i < RF_MAXSPARE; i++) {
		ZERO_FINAL(k_cfg->spare_names[i]);
	}
	for (i = 0; i < RF_MAXDBGV; i++) {
		ZERO_FINAL(k_cfg->debugVars[i]);
	}
#undef ZERO_FINAL

	/* Check some basic limits. */
	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
		retcode = EINVAL;
		goto out;
	}
	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
		retcode = EINVAL;
		goto out;
	}

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
1391
#if RF_DISABLED
/*
 * Overwrite a component label with one supplied from userland
 * (RAIDFRAME_SET_COMPONENT_LABEL).  Currently disabled: most label
 * fields must not be patched this way, and the remaining fields are
 * not filled in yet (see XXX below).
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
1430
1431 static int
1432 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
1433 {
1434 /*
1435 we only want the serial number from
1436 the above. We get all the rest of the information
1437 from the config that was used to create this RAID
1438 set.
1439 */
1440
1441 raidPtr->serial_number = clabel->serial_number;
1442
1443 for (int column = 0; column < raidPtr->numCol; column++) {
1444 RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
1445 if (RF_DEAD_DISK(diskPtr->status))
1446 continue;
1447 RF_ComponentLabel_t *ci_label = raidget_component_label(
1448 raidPtr, column);
1449 /* Zeroing this is important. */
1450 memset(ci_label, 0, sizeof(*ci_label));
1451 raid_init_component_label(raidPtr, ci_label);
1452 ci_label->serial_number = raidPtr->serial_number;
1453 ci_label->row = 0; /* we dont' pretend to support more */
1454 rf_component_label_set_partitionsize(ci_label,
1455 diskPtr->partitionSize);
1456 ci_label->column = column;
1457 raidflush_component_label(raidPtr, column);
1458 /* XXXjld what about the spares? */
1459 }
1460
1461 return 0;
1462 }
1463
/*
 * Rebuild a component in place (RAIDFRAME_REBUILD_IN_PLACE).
 *
 * Validates the request, then - under raidPtr->mutex - refuses if
 * another component has already failed, if a reconstruction is
 * already running on that column, or if the column is spared.
 * On success the work is handed to a new reconstruction thread.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* copy the request so we don't depend on the user's buffer */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		    raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
1531
1532 static int
1533 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
1534 {
1535 /*
1536 * This makes no sense on a RAID 0, or if we are not reconstructing
1537 * so tell the user it's done.
1538 */
1539 if (raidPtr->Layout.map->faultsTolerated == 0 ||
1540 raidPtr->status != rf_rs_reconstructing) {
1541 *data = 100;
1542 return 0;
1543 }
1544 if (raidPtr->reconControl->numRUsTotal == 0) {
1545 *data = 0;
1546 return 0;
1547 }
1548 *data = (raidPtr->reconControl->numRUsComplete * 100
1549 / raidPtr->reconControl->numRUsTotal);
1550 return 0;
1551 }
1552
1553 /*
1554 * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
1555 * on the component_name[] array.
1556 */
1557 static void
1558 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
1559 {
1560
1561 memcpy(component, data, sizeof *component);
1562 component->component_name[sizeof(component->component_name) - 1] = '\0';
1563 }
1564
1565 static int
1566 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1567 {
1568 int unit = raidunit(dev);
1569 int part, pmask;
1570 struct raid_softc *rs;
1571 struct dk_softc *dksc;
1572 RF_Config_t *k_cfg;
1573 RF_Raid_t *raidPtr;
1574 RF_AccTotals_t *totals;
1575 RF_SingleComponent_t component;
1576 RF_DeviceConfig_t *d_cfg, *ucfgp;
1577 int retcode = 0;
1578 int column;
1579 RF_ComponentLabel_t *clabel;
1580 int d;
1581
1582 if ((rs = raidget(unit, false)) == NULL)
1583 return ENXIO;
1584
1585 dksc = &rs->sc_dksc;
1586 raidPtr = &rs->sc_r;
1587
1588 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1589 (int) DISKPART(dev), (int) unit, cmd));
1590
1591 /* Only CONFIGURE and RESCAN can be done without the RAID being initialized. */
1592 switch (cmd) {
1593 case RAIDFRAME_CONFIGURE:
1594 case RAIDFRAME_RESCAN:
1595 break;
1596 default:
1597 if (!rf_inited(rs))
1598 return ENXIO;
1599 }
1600
1601 switch (cmd) {
1602 /* configure the system */
1603 case RAIDFRAME_CONFIGURE:
1604 if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
1605 return retcode;
1606 return rf_construct(rs, k_cfg);
1607
1608 /* shutdown the system */
1609 case RAIDFRAME_SHUTDOWN:
1610
1611 part = DISKPART(dev);
1612 pmask = (1 << part);
1613
1614 if ((retcode = raidlock(rs)) != 0)
1615 return retcode;
1616
1617 if (DK_BUSY(dksc, pmask) ||
1618 raidPtr->recon_in_progress != 0 ||
1619 raidPtr->parity_rewrite_in_progress != 0 ||
1620 raidPtr->copyback_in_progress != 0)
1621 retcode = EBUSY;
1622 else {
1623 /* detach and free on close */
1624 rs->sc_flags |= RAIDF_SHUTDOWN;
1625 retcode = 0;
1626 }
1627
1628 raidunlock(rs);
1629
1630 return retcode;
1631 case RAIDFRAME_GET_COMPONENT_LABEL:
1632 return rf_get_component_label(raidPtr, data);
1633
1634 #if RF_DISABLED
1635 case RAIDFRAME_SET_COMPONENT_LABEL:
1636 return rf_set_component_label(raidPtr, data);
1637 #endif
1638
1639 case RAIDFRAME_INIT_LABELS:
1640 return rf_init_component_label(raidPtr, data);
1641
1642 case RAIDFRAME_SET_AUTOCONFIG:
1643 d = rf_set_autoconfig(raidPtr, *(int *) data);
1644 printf("raid%d: New autoconfig value is: %d\n",
1645 raidPtr->raidid, d);
1646 *(int *) data = d;
1647 return retcode;
1648
1649 case RAIDFRAME_SET_ROOT:
1650 d = rf_set_rootpartition(raidPtr, *(int *) data);
1651 printf("raid%d: New rootpartition value is: %d\n",
1652 raidPtr->raidid, d);
1653 *(int *) data = d;
1654 return retcode;
1655
1656 /* initialize all parity */
1657 case RAIDFRAME_REWRITEPARITY:
1658
1659 if (raidPtr->Layout.map->faultsTolerated == 0) {
1660 /* Parity for RAID 0 is trivially correct */
1661 raidPtr->parity_good = RF_RAID_CLEAN;
1662 return 0;
1663 }
1664
1665 if (raidPtr->parity_rewrite_in_progress == 1) {
1666 /* Re-write is already in progress! */
1667 return EINVAL;
1668 }
1669
1670 return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1671 rf_RewriteParityThread, raidPtr,"raid_parity");
1672
1673 case RAIDFRAME_ADD_HOT_SPARE:
1674 rf_copy_single_component(&component, data);
1675 return rf_add_hot_spare(raidPtr, &component);
1676
1677 case RAIDFRAME_REMOVE_HOT_SPARE:
1678 return retcode;
1679
1680 case RAIDFRAME_DELETE_COMPONENT:
1681 rf_copy_single_component(&component, data);
1682 return rf_delete_component(raidPtr, &component);
1683
1684 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1685 rf_copy_single_component(&component, data);
1686 return rf_incorporate_hot_spare(raidPtr, &component);
1687
1688 case RAIDFRAME_REBUILD_IN_PLACE:
1689 return rf_rebuild_in_place(raidPtr, data);
1690
1691 case RAIDFRAME_GET_INFO:
1692 ucfgp = *(RF_DeviceConfig_t **)data;
1693 d_cfg = RF_Malloc(sizeof(*d_cfg));
1694 if (d_cfg == NULL)
1695 return ENOMEM;
1696 retcode = rf_get_info(raidPtr, d_cfg);
1697 if (retcode == 0) {
1698 retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
1699 }
1700 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1701 return retcode;
1702
1703 case RAIDFRAME_CHECK_PARITY:
1704 *(int *) data = raidPtr->parity_good;
1705 return 0;
1706
1707 case RAIDFRAME_PARITYMAP_STATUS:
1708 if (rf_paritymap_ineligible(raidPtr))
1709 return EINVAL;
1710 rf_paritymap_status(raidPtr->parity_map, data);
1711 return 0;
1712
1713 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1714 if (rf_paritymap_ineligible(raidPtr))
1715 return EINVAL;
1716 if (raidPtr->parity_map == NULL)
1717 return ENOENT; /* ??? */
1718 if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
1719 return EINVAL;
1720 return 0;
1721
1722 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1723 if (rf_paritymap_ineligible(raidPtr))
1724 return EINVAL;
1725 *(int *) data = rf_paritymap_get_disable(raidPtr);
1726 return 0;
1727
1728 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1729 if (rf_paritymap_ineligible(raidPtr))
1730 return EINVAL;
1731 rf_paritymap_set_disable(raidPtr, *(int *)data);
1732 /* XXX should errors be passed up? */
1733 return 0;
1734
1735 case RAIDFRAME_RESCAN:
1736 return rf_rescan();
1737
1738 case RAIDFRAME_RESET_ACCTOTALS:
1739 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1740 return 0;
1741
1742 case RAIDFRAME_GET_ACCTOTALS:
1743 totals = (RF_AccTotals_t *) data;
1744 *totals = raidPtr->acc_totals;
1745 return 0;
1746
1747 case RAIDFRAME_KEEP_ACCTOTALS:
1748 raidPtr->keep_acc_totals = *(int *)data;
1749 return 0;
1750
1751 case RAIDFRAME_GET_SIZE:
1752 *(int *) data = raidPtr->totalSectors;
1753 return 0;
1754
1755 case RAIDFRAME_FAIL_DISK:
1756 return rf_fail_disk(raidPtr, data);
1757
1758 /* invoke a copyback operation after recon on whatever disk
1759 * needs it, if any */
1760 case RAIDFRAME_COPYBACK:
1761
1762 if (raidPtr->Layout.map->faultsTolerated == 0) {
1763 /* This makes no sense on a RAID 0!! */
1764 return EINVAL;
1765 }
1766
1767 if (raidPtr->copyback_in_progress == 1) {
1768 /* Copyback is already in progress! */
1769 return EINVAL;
1770 }
1771
1772 return RF_CREATE_THREAD(raidPtr->copyback_thread,
1773 rf_CopybackThread, raidPtr, "raid_copyback");
1774
1775 /* return the percentage completion of reconstruction */
1776 case RAIDFRAME_CHECK_RECON_STATUS:
1777 return rf_check_recon_status(raidPtr, data);
1778
1779 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1780 rf_check_recon_status_ext(raidPtr, data);
1781 return 0;
1782
1783 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1784 if (raidPtr->Layout.map->faultsTolerated == 0) {
1785 /* This makes no sense on a RAID 0, so tell the
1786 user it's done. */
1787 *(int *) data = 100;
1788 return 0;
1789 }
1790 if (raidPtr->parity_rewrite_in_progress == 1) {
1791 *(int *) data = 100 *
1792 raidPtr->parity_rewrite_stripes_done /
1793 raidPtr->Layout.numStripe;
1794 } else {
1795 *(int *) data = 100;
1796 }
1797 return 0;
1798
1799 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1800 rf_check_parityrewrite_status_ext(raidPtr, data);
1801 return 0;
1802
1803 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1804 if (raidPtr->Layout.map->faultsTolerated == 0) {
1805 /* This makes no sense on a RAID 0 */
1806 *(int *) data = 100;
1807 return 0;
1808 }
1809 if (raidPtr->copyback_in_progress == 1) {
1810 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1811 raidPtr->Layout.numStripe;
1812 } else {
1813 *(int *) data = 100;
1814 }
1815 return 0;
1816
1817 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1818 rf_check_copyback_status_ext(raidPtr, data);
1819 return 0;
1820
1821 case RAIDFRAME_SET_LAST_UNIT:
1822 for (column = 0; column < raidPtr->numCol; column++)
1823 if (raidPtr->Disks[column].status != rf_ds_optimal)
1824 return EBUSY;
1825
1826 for (column = 0; column < raidPtr->numCol; column++) {
1827 clabel = raidget_component_label(raidPtr, column);
1828 clabel->last_unit = *(int *)data;
1829 raidflush_component_label(raidPtr, column);
1830 }
1831 rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1832 return 0;
1833
1834 /* the sparetable daemon calls this to wait for the kernel to
1835 * need a spare table. this ioctl does not return until a
1836 * spare table is needed. XXX -- calling mpsleep here in the
1837 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1838 * -- I should either compute the spare table in the kernel,
1839 * or have a different -- XXX XXX -- interface (a different
1840 * character device) for delivering the table -- XXX */
1841 #if RF_DISABLED
1842 case RAIDFRAME_SPARET_WAIT:
1843 rf_lock_mutex2(rf_sparet_wait_mutex);
1844 while (!rf_sparet_wait_queue)
1845 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1846 RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
1847 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1848 rf_unlock_mutex2(rf_sparet_wait_mutex);
1849
1850 /* structure assignment */
1851 *((RF_SparetWait_t *) data) = *waitreq;
1852
1853 RF_Free(waitreq, sizeof(*waitreq));
1854 return 0;
1855
1856 /* wakes up a process waiting on SPARET_WAIT and puts an error
1857 * code in it that will cause the dameon to exit */
1858 case RAIDFRAME_ABORT_SPARET_WAIT:
1859 waitreq = RF_Malloc(sizeof(*waitreq));
1860 waitreq->fcol = -1;
1861 rf_lock_mutex2(rf_sparet_wait_mutex);
1862 waitreq->next = rf_sparet_wait_queue;
1863 rf_sparet_wait_queue = waitreq;
1864 rf_broadcast_cond2(rf_sparet_wait_cv);
1865 rf_unlock_mutex2(rf_sparet_wait_mutex);
1866 return 0;
1867
1868 /* used by the spare table daemon to deliver a spare table
1869 * into the kernel */
1870 case RAIDFRAME_SEND_SPARET:
1871
1872 /* install the spare table */
1873 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1874
1875 /* respond to the requestor. the return status of the spare
1876 * table installation is passed in the "fcol" field */
1877 waitred = RF_Malloc(sizeof(*waitreq));
1878 waitreq->fcol = retcode;
1879 rf_lock_mutex2(rf_sparet_wait_mutex);
1880 waitreq->next = rf_sparet_resp_queue;
1881 rf_sparet_resp_queue = waitreq;
1882 rf_broadcast_cond2(rf_sparet_resp_cv);
1883 rf_unlock_mutex2(rf_sparet_wait_mutex);
1884
1885 return retcode;
1886 #endif
1887 default:
1888 /*
1889 * Don't bother trying to load compat modules
1890 * if it is not our ioctl. This is more efficient
1891 * and makes rump tests not depend on compat code
1892 */
1893 if (IOCGROUP(cmd) != 'r')
1894 break;
1895 #ifdef _LP64
1896 if ((l->l_proc->p_flag & PK_32) != 0) {
1897 module_autoload("compat_netbsd32_raid",
1898 MODULE_CLASS_EXEC);
1899 MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
1900 (rs, cmd, data), enosys(), retcode);
1901 if (retcode != EPASSTHROUGH)
1902 return retcode;
1903 }
1904 #endif
1905 module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
1906 MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
1907 (rs, cmd, data), enosys(), retcode);
1908 if (retcode != EPASSTHROUGH)
1909 return retcode;
1910
1911 module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
1912 MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
1913 (rs, cmd, data), enosys(), retcode);
1914 if (retcode != EPASSTHROUGH)
1915 return retcode;
1916 break; /* fall through to the os-specific code below */
1917
1918 }
1919
1920 if (!raidPtr->valid)
1921 return EINVAL;
1922
1923 /*
1924 * Add support for "regular" device ioctls here.
1925 */
1926
1927 switch (cmd) {
1928 case DIOCGCACHE:
1929 retcode = rf_get_component_caches(raidPtr, (int *)data);
1930 break;
1931
1932 case DIOCCACHESYNC:
1933 retcode = rf_sync_component_caches(raidPtr, *(int *)data);
1934 break;
1935
1936 default:
1937 retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1938 break;
1939 }
1940
1941 return retcode;
1942
1943 }
1944
1945
/* raidinit -- complete the rest of the initialization for the
   RAIDframe device.  Attaches the pseudo-device, initializes the
   dk(9)/disk(9) structures (which must happen before any disklabel
   work), marks the unit usable, and kicks off wedge discovery. */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* discover wedges on this disk */
	dkwedge_discover(&dksc->sc_dkdev);
}
2005
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
/*
 * Queue 'req' for the userland spare-table daemon, then sleep until a
 * response appears on rf_sparet_resp_queue.  Returns the daemon's
 * status from the response's fcol field; the response structure
 * (a different allocation than the request) is freed here.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
#endif
2040
/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the mutex around the label update, then re-take
		   it to decrement the counter */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
		    RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* have dk(9) push queued buffers through raid_diskstart() */
	dk_start(dksc, NULL);
}
2075
/*
 * Submit one buffer to RAIDframe as a non-blocking access.
 *
 * Returns EAGAIN when no openings are available (the request stays
 * queued and is retried later), ENOSPC when the request falls outside
 * the set or is not sector-aligned, otherwise the rf_DoAccess()
 * result.  An opening is consumed just before dispatch and returned
 * in raiddone().
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int rc;

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		(int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* partial trailing sector counts as one more block (pb) */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			(int) raid_addr, (int) sum, (int) num_blocks,
			(int) pb, (int) bp->b_resid));
	}
	/* range check; the sum<... comparisons also catch wraparound */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
	    RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
	    raid_addr, num_blocks,
	    bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
2142
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */

/*
 * Dispatch one RF_DiskQueueData_t request to the underlying component
 * device.  NOP requests are completed immediately via the wakeup
 * callback; reads/writes are turned into a struct buf with InitBP()
 * and handed to bdev_strategy().  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete immediately: no real device I/O for a NOP */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up bp to target the component's device/vnode; on
		   completion KernelWakeupFunc() is invoked with 'req' */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			req->type, queue->raidPtr->raidid,
			queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
/*
 * Callback (b_iodone) associated with an I/O invoked from kernel code
 * by rf_DispatchKernelIO().
 *
 * Runs at biodone time.  On error it marks the component as failed —
 * but only once, and only if doing so would not exceed the number of
 * faults the RAID layout can tolerate — then queues the request on
 * the raid set's "iodone" list and wakes the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* rf_DispatchKernelIO stashed the request in b_private. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2287
2288
2289 /*
2290 * initialize a buf structure for doing an I/O in the kernel.
2291 */
2292 static void
2293 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2294 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2295 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
2296 {
2297 bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
2298 bp->b_oflags = 0;
2299 bp->b_cflags = 0;
2300 bp->b_bcount = numSect << logBytesPerSector;
2301 bp->b_bufsize = bp->b_bcount;
2302 bp->b_error = 0;
2303 bp->b_dev = dev;
2304 bp->b_data = bf;
2305 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2306 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2307 if (bp->b_bcount == 0) {
2308 panic("bp->b_bcount is zero in InitBP!!");
2309 }
2310 bp->b_iodone = cbFunc;
2311 bp->b_private = cbArg;
2312 }
2313
2314 /*
2315 * Wait interruptibly for an exclusive lock.
2316 *
2317 * XXX
2318 * Several drivers do this; it should be abstracted and made MP-safe.
2319 * (Hmm... where have we seen this warning before :-> GO )
2320 */
2321 static int
2322 raidlock(struct raid_softc *rs)
2323 {
2324 int error;
2325
2326 error = 0;
2327 mutex_enter(&rs->sc_mutex);
2328 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2329 rs->sc_flags |= RAIDF_WANTED;
2330 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2331 if (error != 0)
2332 goto done;
2333 }
2334 rs->sc_flags |= RAIDF_LOCKED;
2335 done:
2336 mutex_exit(&rs->sc_mutex);
2337 return error;
2338 }
2339 /*
2340 * Unlock and wake up any waiters.
2341 */
2342 static void
2343 raidunlock(struct raid_softc *rs)
2344 {
2345
2346 mutex_enter(&rs->sc_mutex);
2347 rs->sc_flags &= ~RAIDF_LOCKED;
2348 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2349 rs->sc_flags &= ~RAIDF_WANTED;
2350 cv_broadcast(&rs->sc_cv);
2351 }
2352 mutex_exit(&rs->sc_mutex);
2353 }
2354
2355
2356 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2357 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2358 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2359
2360 static daddr_t
2361 rf_component_info_offset(void)
2362 {
2363
2364 return RF_COMPONENT_INFO_OFFSET;
2365 }
2366
2367 static daddr_t
2368 rf_component_info_size(unsigned secsize)
2369 {
2370 daddr_t info_size;
2371
2372 KASSERT(secsize);
2373 if (secsize > RF_COMPONENT_INFO_SIZE)
2374 info_size = secsize;
2375 else
2376 info_size = RF_COMPONENT_INFO_SIZE;
2377
2378 return info_size;
2379 }
2380
2381 static daddr_t
2382 rf_parity_map_offset(RF_Raid_t *raidPtr)
2383 {
2384 daddr_t map_offset;
2385
2386 KASSERT(raidPtr->bytesPerSector);
2387 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2388 map_offset = raidPtr->bytesPerSector;
2389 else
2390 map_offset = RF_COMPONENT_INFO_SIZE;
2391 map_offset += rf_component_info_offset();
2392
2393 return map_offset;
2394 }
2395
2396 static daddr_t
2397 rf_parity_map_size(RF_Raid_t *raidPtr)
2398 {
2399 daddr_t map_size;
2400
2401 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2402 map_size = raidPtr->bytesPerSector;
2403 else
2404 map_size = RF_PARITY_MAP_SIZE;
2405
2406 return map_size;
2407 }
2408
2409 int
2410 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2411 {
2412 RF_ComponentLabel_t *clabel;
2413
2414 clabel = raidget_component_label(raidPtr, col);
2415 clabel->clean = RF_RAID_CLEAN;
2416 raidflush_component_label(raidPtr, col);
2417 return(0);
2418 }
2419
2420
2421 int
2422 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2423 {
2424 RF_ComponentLabel_t *clabel;
2425
2426 clabel = raidget_component_label(raidPtr, col);
2427 clabel->clean = RF_RAID_DIRTY;
2428 raidflush_component_label(raidPtr, col);
2429 return(0);
2430 }
2431
/*
 * Read column col's component label from disk into the in-core copy
 * (raid_cinfo[col].ci_label).  Returns the error from
 * raidread_component_label(), 0 on success.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);

	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2442
/*
 * Return a pointer to the in-core component label for column col.
 * The label is owned by the raid set; callers must not free it.
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2448
2449 int
2450 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2451 {
2452 RF_ComponentLabel_t *label;
2453
2454 label = &raidPtr->raid_cinfo[col].ci_label;
2455 label->mod_counter = raidPtr->mod_counter;
2456 #ifndef RF_NO_PARITY_MAP
2457 label->parity_map_modcount = label->mod_counter;
2458 #endif
2459 return raidwrite_component_label(raidPtr->bytesPerSector,
2460 raidPtr->Disks[col].dev,
2461 raidPtr->raid_cinfo[col].ci_vp, label);
2462 }
2463
2464 /*
2465 * Swap the label endianness.
2466 *
2467 * Everything in the component label is 4-byte-swapped except the version,
2468 * which is kept in the byte-swapped version at all times, and indicates
2469 * for the writer that a swap is necessary.
2470 *
2471 * For reads it is expected that out_label == clabel, but writes expect
2472 * separate labels so only the re-swapped label is written out to disk,
2473 * leaving the swapped-except-version internally.
2474 *
2475 * Only support swapping label version 2.
2476 */
2477 static void
2478 rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
2479 {
2480 int *in, *out, *in_last;
2481
2482 KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));
2483
2484 /* Don't swap the label, but do copy it. */
2485 out_label->version = clabel->version;
2486
2487 in = &clabel->serial_number;
2488 in_last = &clabel->future_use2[42];
2489 out = &out_label->serial_number;
2490
2491 for (; in < in_last; in++, out++)
2492 *out = bswap32(*in);
2493 }
2494
2495 static int
2496 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2497 RF_ComponentLabel_t *clabel)
2498 {
2499 int error;
2500
2501 error = raidread_component_area(dev, b_vp, clabel,
2502 sizeof(RF_ComponentLabel_t),
2503 rf_component_info_offset(),
2504 rf_component_info_size(secsize));
2505
2506 if (error == 0 &&
2507 clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2508 rf_swap_label(clabel, clabel);
2509 }
2510
2511 return error;
2512 }
2513
/* ARGSUSED */
/*
 * Synchronously read dsize bytes starting at byte offset `offset' from
 * the raw component device into a scratch buffer, then copy the first
 * msize bytes into `data'.  Returns 0 on success or an errno; EINVAL
 * if the component has no vnode (i.e. is not usable).
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
	bp->b_resid = dsize;

	/* Issue the read and wait for completion. */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	/* Scratch buffer came from geteblk(); always release it. */
	brelse(bp, 0);
	return(error);
}
2551
2552 static int
2553 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2554 RF_ComponentLabel_t *clabel)
2555 {
2556 RF_ComponentLabel_t *clabel_write = clabel;
2557 RF_ComponentLabel_t lclabel;
2558 int error;
2559
2560 if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2561 clabel_write = &lclabel;
2562 rf_swap_label(clabel, clabel_write);
2563 }
2564 error = raidwrite_component_area(dev, b_vp, clabel_write,
2565 sizeof(RF_ComponentLabel_t),
2566 rf_component_info_offset(),
2567 rf_component_info_size(secsize), 0);
2568
2569 return error;
2570 }
2571
2572 /* ARGSUSED */
2573 static int
2574 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2575 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2576 {
2577 struct buf *bp;
2578 int error;
2579
2580 /* get a block of the appropriate size... */
2581 bp = geteblk((int)dsize);
2582 bp->b_dev = dev;
2583
2584 /* get our ducks in a row for the write */
2585 bp->b_blkno = offset / DEV_BSIZE;
2586 bp->b_bcount = dsize;
2587 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2588 bp->b_resid = dsize;
2589
2590 memset(bp->b_data, 0, dsize);
2591 memcpy(bp->b_data, data, msize);
2592
2593 bdev_strategy(bp);
2594 if (asyncp)
2595 return 0;
2596 error = biowait(bp);
2597 brelse(bp, 0);
2598 if (error) {
2599 #if 1
2600 printf("Failed to write RAID component info!\n");
2601 #endif
2602 }
2603
2604 return(error);
2605 }
2606
2607 void
2608 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2609 {
2610 int c;
2611
2612 for (c = 0; c < raidPtr->numCol; c++) {
2613 /* Skip dead disks. */
2614 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2615 continue;
2616 /* XXXjld: what if an error occurs here? */
2617 raidwrite_component_area(raidPtr->Disks[c].dev,
2618 raidPtr->raid_cinfo[c].ci_vp, map,
2619 RF_PARITYMAP_NBYTE,
2620 rf_parity_map_offset(raidPtr),
2621 rf_parity_map_size(raidPtr), 0);
2622 }
2623 }
2624
2625 void
2626 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2627 {
2628 struct rf_paritymap_ondisk tmp;
2629 int c,first;
2630
2631 first=1;
2632 for (c = 0; c < raidPtr->numCol; c++) {
2633 /* Skip dead disks. */
2634 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2635 continue;
2636 raidread_component_area(raidPtr->Disks[c].dev,
2637 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2638 RF_PARITYMAP_NBYTE,
2639 rf_parity_map_offset(raidPtr),
2640 rf_parity_map_size(raidPtr));
2641 if (first) {
2642 memcpy(map, &tmp, sizeof(*map));
2643 first = 0;
2644 } else {
2645 rf_paritymap_merge(map, &tmp);
2646 }
2647 }
2648 }
2649
/*
 * Mark every usable component (and every in-use spare) of the raid set
 * as dirty, bumping the set's modification counter first.  Called when
 * the set goes "in use" so an unclean shutdown can be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		/* Spares live after the data columns in Disks[]. */
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2709
2710
/*
 * Flush fresh component labels to every optimal component and every
 * in-use spare, bumping the modification counter first.  If this is
 * the final update (RF_FINAL_COMPONENT_UPDATE) and parity is known
 * good, also set the clean bit on each label.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		/* Spares live after the data columns in Disks[]. */
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2788
2789 void
2790 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2791 {
2792
2793 if (vp != NULL) {
2794 if (auto_configured == 1) {
2795 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2796 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2797 vput(vp);
2798
2799 } else {
2800 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2801 }
2802 }
2803 }
2804
2805
2806 void
2807 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2808 {
2809 int r,c;
2810 struct vnode *vp;
2811 int acd;
2812
2813
2814 /* We take this opportunity to close the vnodes like we should.. */
2815
2816 for (c = 0; c < raidPtr->numCol; c++) {
2817 vp = raidPtr->raid_cinfo[c].ci_vp;
2818 acd = raidPtr->Disks[c].auto_configured;
2819 rf_close_component(raidPtr, vp, acd);
2820 raidPtr->raid_cinfo[c].ci_vp = NULL;
2821 raidPtr->Disks[c].auto_configured = 0;
2822 }
2823
2824 for (r = 0; r < raidPtr->numSpare; r++) {
2825 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2826 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2827 rf_close_component(raidPtr, vp, acd);
2828 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2829 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2830 }
2831 }
2832
2833
2834 static void
2835 rf_ReconThread(struct rf_recon_req_internal *req)
2836 {
2837 int s;
2838 RF_Raid_t *raidPtr;
2839
2840 s = splbio();
2841 raidPtr = (RF_Raid_t *) req->raidPtr;
2842 raidPtr->recon_in_progress = 1;
2843
2844 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2845 raidPtr->forceRecon = 1;
2846 }
2847
2848 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2849 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2850
2851 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2852 raidPtr->forceRecon = 0;
2853 }
2854
2855 RF_Free(req, sizeof(*req));
2856
2857 raidPtr->recon_in_progress = 0;
2858 splx(s);
2859
2860 /* That's all... */
2861 kthread_exit(0); /* does not return */
2862 }
2863
2864 static void
2865 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2866 {
2867 int retcode;
2868 int s;
2869
2870 raidPtr->parity_rewrite_stripes_done = 0;
2871 raidPtr->parity_rewrite_in_progress = 1;
2872 s = splbio();
2873 retcode = rf_RewriteParity(raidPtr);
2874 splx(s);
2875 if (retcode) {
2876 printf("raid%d: Error re-writing parity (%d)!\n",
2877 raidPtr->raidid, retcode);
2878 } else {
2879 /* set the clean bit! If we shutdown correctly,
2880 the clean bit on each component label will get
2881 set */
2882 raidPtr->parity_good = RF_RAID_CLEAN;
2883 }
2884 raidPtr->parity_rewrite_in_progress = 0;
2885
2886 /* Anyone waiting for us to stop? If so, inform them... */
2887 if (raidPtr->waitShutdown) {
2888 rf_lock_mutex2(raidPtr->rad_lock);
2889 cv_broadcast(&raidPtr->parity_rewrite_cv);
2890 rf_unlock_mutex2(raidPtr->rad_lock);
2891 }
2892
2893 /* That's all... */
2894 kthread_exit(0); /* does not return */
2895 }
2896
2897
2898 static void
2899 rf_CopybackThread(RF_Raid_t *raidPtr)
2900 {
2901 int s;
2902
2903 raidPtr->copyback_in_progress = 1;
2904 s = splbio();
2905 rf_CopybackReconstructedData(raidPtr);
2906 splx(s);
2907 raidPtr->copyback_in_progress = 0;
2908
2909 /* That's all... */
2910 kthread_exit(0); /* does not return */
2911 }
2912
2913
2914 static void
2915 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
2916 {
2917 int s;
2918 RF_Raid_t *raidPtr;
2919
2920 s = splbio();
2921 raidPtr = req->raidPtr;
2922 raidPtr->recon_in_progress = 1;
2923
2924 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2925 raidPtr->forceRecon = 1;
2926 }
2927
2928 rf_ReconstructInPlace(raidPtr, req->col);
2929
2930 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2931 raidPtr->forceRecon = 0;
2932 }
2933
2934 RF_Free(req, sizeof(*req));
2935 raidPtr->recon_in_progress = 0;
2936 splx(s);
2937
2938 /* That's all... */
2939 kthread_exit(0); /* does not return */
2940 }
2941
/*
 * Try to read a component label from the device and, if it looks
 * reasonable, prepend a new RF_AutoConfig_t for it to ac_list.
 *
 * On success the new list head is returned and ownership of vp and
 * the allocated label passes to the list entry.  On failure the label
 * is freed, vp is closed and released, and the list is returned
 * unchanged.  Expects vp unlocked on entry.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			    cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: free the label and drop our reference. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2983
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return a list of candidates for auto-configuration.
 *
 * The scan is made twice: first wedges (dk), then everything else, so
 * that a wedge covering a whole disk is preferred over that disk's raw
 * partition.  For non-wedge disks, each FS_RAID disklabel partition is
 * probed; if none is found (and none is busy), the raw partition is
 * probed as a last resort.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		    dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* only raidframe-typed wedges qualify */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes over vp (unlocked) */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Not quite a 'whatever'.  In
					 * this situation we know
					 * there is a FS_RAID
					 * partition, but we can't
					 * open it.  The most likely
					 * reason is that the
					 * partition is already in
					 * use by another RAID set.
					 * So note that we've already
					 * found a partition on this
					 * disk so we don't attempt
					 * to use the raw disk later. */
					rf_part_found = 1;
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
3207
3208 int
3209 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3210 {
3211
3212 if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
3213 clabel->version==RF_COMPONENT_LABEL_VERSION ||
3214 clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
3215 (clabel->clean == RF_RAID_CLEAN ||
3216 clabel->clean == RF_RAID_DIRTY) &&
3217 clabel->row >=0 &&
3218 clabel->column >= 0 &&
3219 clabel->num_rows > 0 &&
3220 clabel->num_columns > 0 &&
3221 clabel->row < clabel->num_rows &&
3222 clabel->column < clabel->num_columns &&
3223 clabel->blockSize > 0 &&
3224 /*
3225 * numBlocksHi may contain garbage, but it is ok since
3226 * the type is unsigned. If it is really garbage,
3227 * rf_fix_old_label_size() will fix it.
3228 */
3229 rf_component_label_numblocks(clabel) > 0) {
3230 /*
3231 * label looks reasonable enough...
3232 * let's make sure it has no old garbage.
3233 */
3234 if (numsecs)
3235 rf_fix_old_label_size(clabel, numsecs);
3236 return(1);
3237 }
3238 return(0);
3239 }
3240
3241
3242 /*
3243 * For reasons yet unknown, some old component labels have garbage in
3244 * the newer numBlocksHi region, and this causes lossage. Since those
3245 * disks will also have numsecs set to less than 32 bits of sectors,
3246 * we can determine when this corruption has occurred, and fix it.
3247 *
3248 * The exact same problem, with the same unknown reason, happens to
3249 * the partitionSizeHi member as well.
3250 */
3251 static void
3252 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3253 {
3254
3255 if (numsecs < ((uint64_t)1 << 32)) {
3256 if (clabel->numBlocksHi) {
3257 printf("WARNING: total sectors < 32 bits, yet "
3258 "numBlocksHi set\n"
3259 "WARNING: resetting numBlocksHi to zero.\n");
3260 clabel->numBlocksHi = 0;
3261 }
3262
3263 if (clabel->partitionSizeHi) {
3264 printf("WARNING: total sectors < 32 bits, yet "
3265 "partitionSizeHi set\n"
3266 "WARNING: resetting partitionSizeHi to zero.\n");
3267 clabel->partitionSizeHi = 0;
3268 }
3269 }
3270 }
3271
3272
#ifdef DEBUG
/*
 * Pretty-print a component label to the console (debug builds only).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition is masked to 2 bits before indexing. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3306
3307 static RF_ConfigSet_t *
3308 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3309 {
3310 RF_AutoConfig_t *ac;
3311 RF_ConfigSet_t *config_sets;
3312 RF_ConfigSet_t *cset;
3313 RF_AutoConfig_t *ac_next;
3314
3315
3316 config_sets = NULL;
3317
3318 /* Go through the AutoConfig list, and figure out which components
3319 belong to what sets. */
3320 ac = ac_list;
3321 while(ac!=NULL) {
3322 /* we're going to putz with ac->next, so save it here
3323 for use at the end of the loop */
3324 ac_next = ac->next;
3325
3326 if (config_sets == NULL) {
3327 /* will need at least this one... */
3328 config_sets = malloc(sizeof(RF_ConfigSet_t),
3329 M_RAIDFRAME, M_WAITOK);
3330 /* this one is easy :) */
3331 config_sets->ac = ac;
3332 config_sets->next = NULL;
3333 config_sets->rootable = 0;
3334 ac->next = NULL;
3335 } else {
3336 /* which set does this component fit into? */
3337 cset = config_sets;
3338 while(cset!=NULL) {
3339 if (rf_does_it_fit(cset, ac)) {
3340 /* looks like it matches... */
3341 ac->next = cset->ac;
3342 cset->ac = ac;
3343 break;
3344 }
3345 cset = cset->next;
3346 }
3347 if (cset==NULL) {
3348 /* didn't find a match above... new set..*/
3349 cset = malloc(sizeof(RF_ConfigSet_t),
3350 M_RAIDFRAME, M_WAITOK);
3351 cset->ac = ac;
3352 ac->next = NULL;
3353 cset->next = config_sets;
3354 cset->rootable = 0;
3355 config_sets = cset;
3356 }
3357 }
3358 ac = ac_next;
3359 }
3360
3361
3362 return(config_sets);
3363 }
3364
3365 static int
3366 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3367 {
3368 RF_ComponentLabel_t *clabel1, *clabel2;
3369
3370 /* If this one matches the *first* one in the set, that's good
3371 enough, since the other members of the set would have been
3372 through here too... */
3373 /* note that we are not checking partitionSize here..
3374
3375 Note that we are also not checking the mod_counters here.
3376 If everything else matches except the mod_counter, that's
3377 good enough for this test. We will deal with the mod_counters
3378 a little later in the autoconfiguration process.
3379
3380 (clabel1->mod_counter == clabel2->mod_counter) &&
3381
3382 The reason we don't check for this is that failed disks
3383 will have lower modification counts. If those disks are
3384 not added to the set they used to belong to, then they will
3385 form their own set, which may result in 2 different sets,
3386 for example, competing to be configured at raid0, and
3387 perhaps competing to be the root filesystem set. If the
3388 wrong ones get configured, or both attempt to become /,
3389 weird behaviour and or serious lossage will occur. Thus we
3390 need to bring them into the fold here, and kick them out at
3391 a later point.
3392
3393 */
3394
3395 clabel1 = cset->ac->clabel;
3396 clabel2 = ac->clabel;
3397 if ((clabel1->version == clabel2->version) &&
3398 (clabel1->serial_number == clabel2->serial_number) &&
3399 (clabel1->num_rows == clabel2->num_rows) &&
3400 (clabel1->num_columns == clabel2->num_columns) &&
3401 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3402 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3403 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3404 (clabel1->parityConfig == clabel2->parityConfig) &&
3405 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3406 (clabel1->blockSize == clabel2->blockSize) &&
3407 rf_component_label_numblocks(clabel1) ==
3408 rf_component_label_numblocks(clabel2) &&
3409 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3410 (clabel1->root_partition == clabel2->root_partition) &&
3411 (clabel1->last_unit == clabel2->last_unit) &&
3412 (clabel1->config_order == clabel2->config_order)) {
3413 /* if it get's here, it almost *has* to be a match */
3414 } else {
3415 /* it's not consistent with somebody in the set..
3416 punt */
3417 return(0);
3418 }
3419 /* all was fine.. it must fit... */
3420 return(1);
3421 }
3422
/*
 * Decide whether a configuration set has enough working components to
 * be brought up.  Returns 1 if the set is usable, 0 otherwise.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set. If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The authoritative mod_counter for the set is the maximum over
	   all members; a member with a lower count is stale (failed). */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a component carrying the winning
	   mod_counter; a stale or absent component counts as missing
	   for that column. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
			/* Didn't find one here! */
			/* special case for RAID 1, especially
			   where there are more than 2
			   components (where RAIDframe treats
			   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component. If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just did an even component, and we didn't
			   bail.. reset the even_pair_failed flag,
			   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate at
	   most one.  (RAID 1 pairs were handled in the loop above.) */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3525
3526 static void
3527 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3528 RF_Raid_t *raidPtr)
3529 {
3530 RF_ComponentLabel_t *clabel;
3531 int i;
3532
3533 clabel = ac->clabel;
3534
3535 /* 1. Fill in the common stuff */
3536 config->numCol = clabel->num_columns;
3537 config->numSpare = 0; /* XXX should this be set here? */
3538 config->sectPerSU = clabel->sectPerSU;
3539 config->SUsPerPU = clabel->SUsPerPU;
3540 config->SUsPerRU = clabel->SUsPerRU;
3541 config->parityConfig = clabel->parityConfig;
3542 /* XXX... */
3543 strcpy(config->diskQueueType,"fifo");
3544 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3545 config->layoutSpecificSize = 0; /* XXX ?? */
3546
3547 while(ac!=NULL) {
3548 /* row/col values will be in range due to the checks
3549 in reasonable_label() */
3550 strcpy(config->devnames[0][ac->clabel->column],
3551 ac->devname);
3552 ac = ac->next;
3553 }
3554
3555 for(i=0;i<RF_MAXDBGV;i++) {
3556 config->debugVars[i][0] = 0;
3557 }
3558 }
3559
3560 static int
3561 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3562 {
3563 RF_ComponentLabel_t *clabel;
3564 int column;
3565 int sparecol;
3566
3567 raidPtr->autoconfigure = new_value;
3568
3569 for(column=0; column<raidPtr->numCol; column++) {
3570 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3571 clabel = raidget_component_label(raidPtr, column);
3572 clabel->autoconfigure = new_value;
3573 raidflush_component_label(raidPtr, column);
3574 }
3575 }
3576 for(column = 0; column < raidPtr->numSpare ; column++) {
3577 sparecol = raidPtr->numCol + column;
3578 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3579 clabel = raidget_component_label(raidPtr, sparecol);
3580 clabel->autoconfigure = new_value;
3581 raidflush_component_label(raidPtr, sparecol);
3582 }
3583 }
3584 return(new_value);
3585 }
3586
3587 static int
3588 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3589 {
3590 RF_ComponentLabel_t *clabel;
3591 int column;
3592 int sparecol;
3593
3594 raidPtr->root_partition = new_value;
3595 for(column=0; column<raidPtr->numCol; column++) {
3596 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3597 clabel = raidget_component_label(raidPtr, column);
3598 clabel->root_partition = new_value;
3599 raidflush_component_label(raidPtr, column);
3600 }
3601 }
3602 for(column = 0; column < raidPtr->numSpare ; column++) {
3603 sparecol = raidPtr->numCol + column;
3604 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3605 clabel = raidget_component_label(raidPtr, sparecol);
3606 clabel->root_partition = new_value;
3607 raidflush_component_label(raidPtr, sparecol);
3608 }
3609 }
3610 return(new_value);
3611 }
3612
3613 static void
3614 rf_release_all_vps(RF_ConfigSet_t *cset)
3615 {
3616 RF_AutoConfig_t *ac;
3617
3618 ac = cset->ac;
3619 while(ac!=NULL) {
3620 /* Close the vp, and give it back */
3621 if (ac->vp) {
3622 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3623 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3624 vput(ac->vp);
3625 ac->vp = NULL;
3626 }
3627 ac = ac->next;
3628 }
3629 }
3630
3631
3632 static void
3633 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3634 {
3635 RF_AutoConfig_t *ac;
3636 RF_AutoConfig_t *next_ac;
3637
3638 ac = cset->ac;
3639 while(ac!=NULL) {
3640 next_ac = ac->next;
3641 /* nuke the label */
3642 free(ac->clabel, M_RAIDFRAME);
3643 /* cleanup the config structure */
3644 free(ac, M_RAIDFRAME);
3645 /* "next.." */
3646 ac = next_ac;
3647 }
3648 /* and, finally, nuke the config set */
3649 free(cset, M_RAIDFRAME);
3650 }
3651
3652
/*
 * Populate a component label from the current state and geometry of
 * the RAID set.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* Set geometry: rows are always 1 in this implementation. */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* Stripe layout parameters, copied from the in-core layout. */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3686
/*
 * Configure one autodetected configuration set, creating a raid unit
 * for it.  Returns the softc on success, NULL on failure.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Scan upward from last_unit until a unit that is not already
	   valid (configured) is found. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* NOTE(review): passing true presumably makes raidget()
	   allocate the softc if it does not yet exist -- confirm
	   against raidget()'s definition. */
	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1: /* Force Root */
		case 2: /* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine. Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed; release the unit again. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3758
3759 void
3760 rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
3761 size_t xmin, size_t xmax)
3762 {
3763
3764 /* Format: raid%d_foo */
3765 snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);
3766
3767 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3768 pool_sethiwat(p, xmax);
3769 pool_prime(p, xmin);
3770 }
3771
3772
3773 /*
3774 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3775 * to see if there is IO pending and if that IO could possibly be done
3776 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3777 * otherwise.
3778 *
3779 */
3780 int
3781 rf_buf_queue_check(RF_Raid_t *raidPtr)
3782 {
3783 struct raid_softc *rs;
3784 struct dk_softc *dksc;
3785
3786 rs = raidPtr->softc;
3787 dksc = &rs->sc_dksc;
3788
3789 if ((rs->sc_flags & RAIDF_INITED) == 0)
3790 return 1;
3791
3792 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3793 /* there is work to do */
3794 return 0;
3795 }
3796 /* default is nothing to do */
3797 return 1;
3798 }
3799
3800 int
3801 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3802 {
3803 uint64_t numsecs;
3804 unsigned secsize;
3805 int error;
3806
3807 error = getdisksize(vp, &numsecs, &secsize);
3808 if (error == 0) {
3809 diskPtr->blockSize = secsize;
3810 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3811 diskPtr->partitionSize = numsecs;
3812 return 0;
3813 }
3814 return error;
3815 }
3816
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	/* Pseudo-device: always match. */
	return 1;
}
3822
static void
raid_attach(device_t parent, device_t self, void *aux)
{
	/* Nothing to do at attach time; the real setup appears to
	   happen elsewhere (e.g. raidinit()) -- see callers. */
}
3827
3828
3829 static int
3830 raid_detach(device_t self, int flags)
3831 {
3832 int error;
3833 struct raid_softc *rs = raidsoftc(self);
3834
3835 if (rs == NULL)
3836 return ENXIO;
3837
3838 if ((error = raidlock(rs)) != 0)
3839 return error;
3840
3841 error = raid_detach_unlocked(rs);
3842
3843 raidunlock(rs);
3844
3845 /* XXX raid can be referenced here */
3846
3847 if (error)
3848 return error;
3849
3850 /* Free the softc */
3851 raidput(rs);
3852
3853 return 0;
3854 }
3855
/*
 * Publish a (synthetic) disk geometry for the RAID set to the disk
 * subsystem.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* NOTE(review): 4 * numCol looks like an arbitrary choice for
	   a fabricated geometry -- confirm nothing depends on it. */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
3871
3872 /*
3873 * Get cache info for all the components (including spares).
3874 * Returns intersection of all the cache flags of all disks, or first
3875 * error if any encountered.
3876 * XXXfua feature flags can change as spares are added - lock down somehow
3877 */
3878 static int
3879 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3880 {
3881 int c;
3882 int error;
3883 int dkwhole = 0, dkpart;
3884
3885 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3886 /*
3887 * Check any non-dead disk, even when currently being
3888 * reconstructed.
3889 */
3890 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
3891 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3892 DIOCGCACHE, &dkpart, FREAD, NOCRED);
3893 if (error) {
3894 if (error != ENODEV) {
3895 printf("raid%d: get cache for component %s failed\n",
3896 raidPtr->raidid,
3897 raidPtr->Disks[c].devname);
3898 }
3899
3900 return error;
3901 }
3902
3903 if (c == 0)
3904 dkwhole = dkpart;
3905 else
3906 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3907 }
3908 }
3909
3910 *data = dkwhole;
3911
3912 return 0;
3913 }
3914
3915 /*
3916 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3917 * We end up returning whatever error was returned by the first cache flush
3918 * that fails.
3919 */
3920
3921 static int
3922 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
3923 {
3924 int e = 0;
3925 for (int i = 0; i < 5; i++) {
3926 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3927 &force, FWRITE, NOCRED);
3928 if (!e || e == ENODEV)
3929 return e;
3930 printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
3931 raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
3932 }
3933 return e;
3934 }
3935
3936 int
3937 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
3938 {
3939 int c, error;
3940
3941 error = 0;
3942 for (c = 0; c < raidPtr->numCol; c++) {
3943 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3944 int e = rf_sync_component_cache(raidPtr, c, force);
3945 if (e && !error)
3946 error = e;
3947 }
3948 }
3949
3950 for (c = 0; c < raidPtr->numSpare ; c++) {
3951 int sparecol = raidPtr->numCol + c;
3952 /* Need to ensure that the reconstruct actually completed! */
3953 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3954 int e = rf_sync_component_cache(raidPtr, sparecol,
3955 force);
3956 if (e && !error)
3957 error = e;
3958 }
3959 }
3960 return error;
3961 }
3962
3963 /* Fill in info with the current status */
3964 void
3965 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3966 {
3967
3968 memset(info, 0, sizeof(*info));
3969
3970 if (raidPtr->status != rf_rs_reconstructing) {
3971 info->total = 100;
3972 info->completed = 100;
3973 } else {
3974 info->total = raidPtr->reconControl->numRUsTotal;
3975 info->completed = raidPtr->reconControl->numRUsComplete;
3976 }
3977 info->remaining = info->total - info->completed;
3978 }
3979
3980 /* Fill in info with the current status */
3981 void
3982 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3983 {
3984
3985 memset(info, 0, sizeof(*info));
3986
3987 if (raidPtr->parity_rewrite_in_progress == 1) {
3988 info->total = raidPtr->Layout.numStripe;
3989 info->completed = raidPtr->parity_rewrite_stripes_done;
3990 } else {
3991 info->completed = 100;
3992 info->total = 100;
3993 }
3994 info->remaining = info->total - info->completed;
3995 }
3996
3997 /* Fill in info with the current status */
3998 void
3999 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
4000 {
4001
4002 memset(info, 0, sizeof(*info));
4003
4004 if (raidPtr->copyback_in_progress == 1) {
4005 info->total = raidPtr->Layout.numStripe;
4006 info->completed = raidPtr->copyback_stripes_done;
4007 info->remaining = info->total - info->completed;
4008 } else {
4009 info->remaining = 0;
4010 info->completed = 100;
4011 info->total = 100;
4012 }
4013 }
4014
4015 /* Fill in config with the current info */
4016 int
4017 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
4018 {
4019 int d, i, j;
4020
4021 if (!raidPtr->valid)
4022 return ENODEV;
4023 config->cols = raidPtr->numCol;
4024 config->ndevs = raidPtr->numCol;
4025 if (config->ndevs >= RF_MAX_DISKS)
4026 return ENOMEM;
4027 config->nspares = raidPtr->numSpare;
4028 if (config->nspares >= RF_MAX_DISKS)
4029 return ENOMEM;
4030 config->maxqdepth = raidPtr->maxQueueDepth;
4031 d = 0;
4032 for (j = 0; j < config->cols; j++) {
4033 config->devs[d] = raidPtr->Disks[j];
4034 d++;
4035 }
4036 for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
4037 config->spares[i] = raidPtr->Disks[j];
4038 if (config->spares[i].status == rf_ds_rebuilding_spare) {
4039 /* XXX: raidctl(8) expects to see this as a used spare */
4040 config->spares[i].status = rf_ds_used_spare;
4041 }
4042 }
4043 return 0;
4044 }
4045
4046 int
4047 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
4048 {
4049 RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
4050 RF_ComponentLabel_t *raid_clabel;
4051 int column = clabel->column;
4052
4053 if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
4054 return EINVAL;
4055 raid_clabel = raidget_component_label(raidPtr, column);
4056 memcpy(clabel, raid_clabel, sizeof *clabel);
4057 /* Fix-up for userland. */
4058 if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
4059 clabel->version = RF_COMPONENT_LABEL_VERSION;
4060
4061 return 0;
4062 }
4063
4064 /*
4065 * Module interface
4066 */
4067
/* Module declaration; the third argument lists required modules. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* cfdriver is declared here only for modular builds. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* Module command handler and its init/fini helpers. */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
4077
4078 static int
4079 raid_modcmd(modcmd_t cmd, void *data)
4080 {
4081 int error;
4082
4083 error = 0;
4084 switch (cmd) {
4085 case MODULE_CMD_INIT:
4086 error = raid_modcmd_init();
4087 break;
4088 case MODULE_CMD_FINI:
4089 error = raid_modcmd_fini();
4090 break;
4091 default:
4092 error = ENOTTY;
4093 break;
4094 }
4095 return error;
4096 }
4097
/*
 * Module load: attach devsw, cfdriver (modular builds) and cfattach,
 * boot RAIDframe, and register the autoconfiguration finalizer.
 * Earlier attachments are rolled back on failure.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the majors. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST (already attached, e.g. built-in) is tolerated. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back the earlier attachments. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* NOTE(review): error is necessarily 0 here -- it was
	   reassigned by config_cfattach_attach above and non-zero
	   values returned early -- so this guard looks vestigial. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: continue without autoconfiguration. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
4168
/*
 * Module unload: detach cfattach/cfdriver/devsw and shut RAIDframe
 * down.  Fails with EBUSY while any raid device still exists.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist. */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back the cfattach detach done just above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	devsw_detach(&raid_bdevsw, &raid_cdevsw);
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
4209