/*	$NetBSD: rf_netbsdkintf.c,v 1.410.4.4 2024/04/28 12:09:08 martin Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Greg Oster; Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *
 *      @(#)cd.c        8.2 (Berkeley) 11/16/93
 */

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Mark Holland, Jim Zelenka
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/***********************************************************
 *
 * rf_kintf.c -- the kernel interface routines for RAIDframe
 *
 ***********************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.410.4.4 2024/04/28 12:09:08 martin Exp $");

#ifdef _KERNEL_OPT
#include "opt_raid_autoconfig.h"
#include "opt_compat_netbsd32.h"
#endif

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/disk.h>
#include <sys/device.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/reboot.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/compat_stub.h>

#include <prop/proplib.h>

#include <dev/raidframe/raidframevar.h>
#include <dev/raidframe/raidframeio.h>
#include <dev/raidframe/rf_paritymap.h>

#include "rf_raid.h"
#include "rf_copyback.h"
#include "rf_dag.h"
#include "rf_dagflags.h"
#include "rf_desc.h"
#include "rf_diskqueue.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_kintf.h"
#include "rf_options.h"
#include "rf_driver.h"
#include "rf_parityscan.h"
#include "rf_threadstuff.h"

#include "ioconf.h"

#ifdef DEBUG
int rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else				/* DEBUG */
#define db1_printf(a) { }
#endif				/* DEBUG */

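/* Format a possibly-NULL device for message output. */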
#define DEVICE_XNAME(dev) dev ? device_xname(dev) : "null"

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */
#endif

const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);

MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");

/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int);
static void raidinit(struct raid_softc *);
static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);

static int raid_match(device_t, cfdata_t, void *);
static void raid_attach(device_t, device_t, void *);
static int raid_detach(device_t, int);

static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

static int raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);

static int raid_diskstart(device_t, struct buf *bp);
static int raid_dumpblocks(device_t, void *, daddr_t, int);
static int raid_lastclose(device_t);

static dev_type_open(raidopen);
static dev_type_close(raidclose);
static dev_type_read(raidread);
static dev_type_write(raidwrite);
static dev_type_ioctl(raidioctl);
static dev_type_strategy(raidstrategy);
static dev_type_dump(raiddump);
static dev_type_size(raidsize);

const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};

#define raidunit(x)	DISKUNIT(x)
#define raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)

extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;
	RF_ReconReqFlags_t flags;
	void *raidPtr;
};

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif

#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

static void rf_ReconThread(struct rf_recon_req_internal *);
static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
static void rf_CopybackThread(RF_Raid_t *raidPtr);
static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
static int rf_autoconfig(device_t);
static int rf_rescan(void);
static void rf_buildroothack(RF_ConfigSet_t *);

static RF_AutoConfig_t *rf_find_raid_components(void);
static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *, RF_AutoConfig_t *);
static void rf_create_configuration(RF_AutoConfig_t *, RF_Config_t *, RF_Raid_t *);
static int rf_set_autoconfig(RF_Raid_t *, int);
static int rf_set_rootpartition(RF_Raid_t *, int);
static void rf_release_all_vps(RF_ConfigSet_t *);
static void rf_cleanup_config_set(RF_ConfigSet_t *);
static int rf_have_enough_components(RF_ConfigSet_t *);
static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
static bool raidautoconfigdone = false;

struct pool rf_alloclist_pool;	/* AllocList */

static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;

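/* Allocate and initialize a new softc for RAID unit "unit". */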
static struct raid_softc *
raidcreate(int unit) {
	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
	sc->sc_unit = unit;
	cv_init(&sc->sc_cv, "raidunit");
	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
	return sc;
}

static void
raiddestroy(struct raid_softc *sc) {
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mutex);
	kmem_free(sc, sizeof(*sc));
}

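/*
 * Find the softc for the given unit.  If "create" is true and the unit
 * does not exist yet, allocate one and insert it into the list.
 */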
static struct raid_softc *
raidget(int unit, bool create) {
	struct raid_softc *sc;
	if (unit < 0) {
#ifdef DIAGNOSTIC
		panic("%s: unit %d!", __func__, unit);
#endif
		return NULL;
	}
	mutex_enter(&raid_lock);
	LIST_FOREACH(sc, &raids, sc_link) {
		if (sc->sc_unit == unit) {
			mutex_exit(&raid_lock);
			return sc;
		}
	}
	mutex_exit(&raid_lock);
	if (!create)
		return NULL;
	sc = raidcreate(unit);
	mutex_enter(&raid_lock);
	LIST_INSERT_HEAD(&raids, sc, sc_link);
	mutex_exit(&raid_lock);
	return sc;
}

static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}

void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}

static int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return 0;

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}

int
rf_inited(const struct raid_softc *rs) {
	return (rs->sc_flags & RAIDF_INITED) != 0;
}

RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}

int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}

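/*
 * Return 1 if this RAID set contains the boot device (either directly,
 * or as the parent disk of a wedge component), 0 otherwise.
 */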
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}

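/*
 * Scan for RAID components and autoconfigure every complete set that
 * has autoconfiguration enabled, re-scanning after each pass that
 * configures something so that RAID sets built on top of other RAID
 * sets are found as well.
 */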
static int
rf_rescan(void)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets, *cset, *next_cset;
	struct raid_softc *sc;
	int raid_added;

	ac_list = rf_find_raid_components();
	config_sets = rf_create_auto_sets(ac_list);

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
					    sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	return 0;
}

/*
 * Example setup:
 * dk1 at wd0: "raid@wd0", 171965 blocks at 32802, type: raidframe
 * dk3 at wd1: "raid@wd1", 171965 blocks at 32802, type: raidframe
 * raid1: Components: /dev/dk1 /dev/dk3
 * dk4 at raid1: "empty@raid1", 8192 blocks at 34, type: msdos
 * dk5 at raid1: "root@raid1", 163517 blocks at 8226, type: ffs
 *
 * If booted from wd0, booted_device will be
 * disk wd0, startblk = 41092, nblks = 163517
 *
 * That is, dk5 with startblk computed from the beginning of wd0
 * instead of beginning of raid1:
 * 32802 + 64 (RF_PROTECTED_SECTORS) + 8226 = 41092
 *
 * In order to find the boot wedge, we must iterate on each component,
 * find its offset from disk beginning, and look for the boot wedge with
 * startblk adjusted.
 */
static device_t
rf_find_bootwedge(struct raid_softc *rsc)
{
	RF_Raid_t *r = &rsc->sc_r;
	const char *bootname;
	size_t len;
	device_t rdev = NULL;

	if (booted_device == NULL)
		goto out;

	bootname = device_xname(booted_device);
	len = strlen(bootname);

	aprint_debug("%s: booted_device %s, startblk = %"PRId64", "
	    "nblks = %"PRId64"\n", __func__,
	    bootname, booted_startblk, booted_nblks);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		const char *parent;
		struct disk *dk;
		u_int nwedges;
		struct dkwedge_info *dkwi;
		struct dkwedge_list dkwl;
		size_t dkwi_len;
		int i;

		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) != 0)
			continue;

		parent = dkwedge_get_parent_name(r->Disks[col].dev);
		if (parent == NULL) {
			aprint_debug("%s: cannot find parent for "
			    "component /dev/%s", __func__, devname);
			continue;
		}

		if (strncmp(parent, bootname, len) != 0)
			continue;

		aprint_debug("%s: looking up wedge %s in device %s\n",
		    __func__, devname, parent);

		dk = disk_find(parent);
		nwedges = dk->dk_nwedges;
		dkwi_len = sizeof(*dkwi) * nwedges;
		dkwi = RF_Malloc(dkwi_len);

		dkwl.dkwl_buf = dkwi;
		dkwl.dkwl_bufsize = dkwi_len;
		dkwl.dkwl_nwedges = 0;
		dkwl.dkwl_ncopied = 0;

		if (dkwedge_list(dk, &dkwl, curlwp) == 0) {
			daddr_t startblk;

			for (i = 0; i < dkwl.dkwl_ncopied; i++) {
				if (strcmp(dkwi[i].dkw_devname, devname) == 0)
					break;
			}

			KASSERT(i < dkwl.dkwl_ncopied);

			aprint_debug("%s: wedge %s, "
			    "startblk = %"PRId64", "
			    "nblks = %"PRId64"\n",
			    __func__,
			    dkwi[i].dkw_devname,
			    dkwi[i].dkw_offset,
			    dkwi[i].dkw_size);

			startblk = booted_startblk
			    - dkwi[i].dkw_offset
			    - RF_PROTECTED_SECTORS;

			aprint_debug("%s: looking for wedge in %s, "
			    "startblk = %"PRId64", "
			    "nblks = %"PRId64"\n",
			    __func__,
			    DEVICE_XNAME(rsc->sc_dksc.sc_dev),
			    startblk, booted_nblks);

			rdev = dkwedge_find_partition(rsc->sc_dksc.sc_dev,
			    startblk,
			    booted_nblks);
			if (rdev) {
				aprint_debug("%s: root candidate wedge %s "
				    "shifted from %s\n", __func__,
				    device_xname(rdev),
				    dkwi[i].dkw_devname);
				goto done;
			} else {
				aprint_debug("%s: not found\n", __func__);
			}
		}

		aprint_debug("%s: nothing found for col %d\n", __func__, col);
done:
		RF_Free(dkwi, dkwi_len);
	}

out:
	if (!rdev)
		aprint_debug("%s: nothing found\n", __func__);

	return rdev;
}

static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	int raid_added;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
					    sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */
	if (num_root == 1) {
		device_t candidate_root = NULL;
		dksc = &rsc->sc_dksc;

		if (dksc->sc_dkdev.dk_nwedges != 0) {

			/* Find the wedge we booted from */
			candidate_root = rf_find_bootwedge(rsc);

			/* Try first partition */
			if (candidate_root == NULL) {
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			aprint_debug("%s: candidate wedge root %s\n",
			    __func__, DEVICE_XNAME(candidate_root));
		} else {
			candidate_root = dksc->sc_dev;
		}

		aprint_debug("%s: candidate root = %s, booted_device = %s, "
		    "root_partition = %d, contains_boot=%d\n",
		    __func__, DEVICE_XNAME(candidate_root),
		    DEVICE_XNAME(booted_device), rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));

		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			aprint_debug("%s: set booted_device = %s\n", __func__,
			    DEVICE_XNAME(booted_device));
		}
	} else if (num_root > 1) {
		aprint_debug("%s: many roots=%d, %s\n", __func__, num_root,
		    DEVICE_XNAME(booted_device));

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}

static int
raidsize(dev_t dev)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;
	unsigned int unit;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return -1;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return -1;

	return dk_size(dksc, dev);
}

static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

	/*
	   Note that blkno is relative to this particular partition.
	   By adding RF_PROTECTED_SECTORS, we get a value that is
	   relative to the partition used for the underlying component.
	*/
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}

static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;

		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for (j = 0; j < raidPtr->numCol; j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}

/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int error = 0;
	int part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return error;
}

static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/* Last one... device is not unconfigured yet.
	   Device shutdown has taken care of setting the
	   clean bits if RAIDF_INITED is not set;
	   mark things as clean... */

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}

/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return error;
}

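/* Wake up anyone waiting on iodone_cv so that queued I/O gets scheduled. */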
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}

static void
raidstrategy(struct buf *bp)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Raid_t *raidPtr;

	unit = raidunit(bp->b_dev);
	if ((rs = raidget(unit, false)) == NULL) {
		bp->b_error = ENXIO;
		goto fail;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto fail;
	}
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	/* Queue IO only */
	if (dk_strategy_defer(dksc, bp))
		goto done;

	/* schedule the IO to happen at the next convenient time */
	raid_wakeup(raidPtr);

done:
	return;

fail:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}

static int
raid_diskstart(device_t dev, struct buf *bp)
{
	struct raid_softc *rs = raidsoftc(dev);
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		db1_printf(("raid is not valid..\n"));
		return ENODEV;
	}

	/* XXX */
	bp->b_resid = 0;

	return raiddoaccess(raidPtr, bp);
}

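/*
 * Complete a buffer that RAIDframe has finished with: hand it back to
 * dk(9), release one opening, and wake the engine to schedule more I/O.
 */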
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}

/* ARGSUSED */
static int
raidread(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
}

/* ARGSUSED */
static int
raidwrite(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
}

static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}

int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		raidPtr->abortRecon[rr->col] = 1;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		int spareCol = raidPtr->Disks[rr->col].spareCol;

		if (spareCol < raidPtr->numCol ||
		    spareCol >= raidPtr->numCol + raidPtr->numSpare)
			goto out;

		/*
		 * Fail the spare disk so that we can
		 * reconstruct on another one.
		 */
		raidPtr->Disks[spareCol].status = rf_ds_failed;

	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}

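/*
 * Copy the layout-specific data in from userland, replacing the user
 * pointer in k_cfg with a pointer to the kernel copy.
 */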
static int
rf_copyinspecificbuf(RF_Config_t *k_cfg)
{
	/* allocate a buffer for the layout-specific data, and copy it in */
	if (k_cfg->layoutSpecificSize == 0)
		return 0;

	if (k_cfg->layoutSpecificSize > 10000) {
		/* sanity check */
		return EINVAL;
	}

	u_char *specific_buf;
	specific_buf = RF_Malloc(k_cfg->layoutSpecificSize);
	if (specific_buf == NULL)
		return ENOMEM;

	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
	    k_cfg->layoutSpecificSize);
	if (retcode) {
		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
		return retcode;
	}

	k_cfg->layoutSpecific = specific_buf;
	return 0;
}

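/*
 * Copy a RF_Config_t in from userland; "data" points to a user space
 * pointer to the configuration structure.  Fails if a valid RAID set
 * is already configured on this unit.
 */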
static int
rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
{
	RF_Config_t *u_cfg = *((RF_Config_t **) data);

	if (rs->sc_r.valid) {
		/* There is a valid RAID set running on this unit! */
		printf("raid%d: Device already configured!\n", rs->sc_unit);
		return EINVAL;
	}

	/* copy-in the configuration information */
	/* data points to a pointer to the configuration structure */
	*k_cfg = RF_Malloc(sizeof(**k_cfg));
	if (*k_cfg == NULL) {
		return ENOMEM;
	}
	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
	if (retcode == 0)
		return 0;
	RF_Free(*k_cfg, sizeof(RF_Config_t));
	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
	rs->sc_flags |= RAIDF_SHUTDOWN;
	return retcode;
}

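/*
 * Build a RAID set from the kernel copy of the configuration and
 * attach the disk on success.  The configuration is freed here in
 * either case; on failure the unit is marked for detach on close.
 */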
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode, i;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* Force nul-termination on all strings. */
#define ZERO_FINAL(s) do { s[sizeof(s) - 1] = '\0'; } while (0)
	for (i = 0; i < RF_MAXCOL; i++) {
		ZERO_FINAL(k_cfg->devnames[0][i]);
	}
	for (i = 0; i < RF_MAXSPARE; i++) {
		ZERO_FINAL(k_cfg->spare_names[i]);
	}
	for (i = 0; i < RF_MAXDBGV; i++) {
		ZERO_FINAL(k_cfg->debugVars[i]);
	}
#undef ZERO_FINAL

	/* Check some basic limits. */
	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
		retcode = EINVAL;
		goto out;
	}
	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
		retcode = EINVAL;
		goto out;
	}

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 * there is no stale data left in the case of a
	 * reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}

#if RF_DISABLED
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif

static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above.  We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	 */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we don't pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}

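/*
 * Reconstruct data "in place" onto the given component, running the
 * reconstruction in a separate kernel thread.
 */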
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		    raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}

static int
rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
{
	/*
	 * This makes no sense on a RAID 0, or if we are not reconstructing
	 * so tell the user it's done.
	 */
	if (raidPtr->Layout.map->faultsTolerated == 0 ||
	    raidPtr->status != rf_rs_reconstructing) {
		*data = 100;
		return 0;
	}
	if (raidPtr->reconControl->numRUsTotal == 0) {
		*data = 0;
		return 0;
	}
	*data = (raidPtr->reconControl->numRUsComplete * 100
	    / raidPtr->reconControl->numRUsTotal);
	return 0;
}

/*
 * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
 * on the component_name[] array.
 */
static void
rf_copy_single_component(RF_SingleComponent_t *component, void *data)
{

	memcpy(component, data, sizeof *component);
	component->component_name[sizeof(component->component_name) - 1] = '\0';
}

static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int unit = raidunit(dev);
	int part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg;
	RF_Raid_t *raidPtr;
	RF_AccTotals_t *totals;
	RF_SingleComponent_t component;
	RF_DeviceConfig_t *d_cfg, *ucfgp;
	int retcode = 0;
	int column;
	RF_ComponentLabel_t *clabel;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
	    (int) DISKPART(dev), (int) unit, cmd));

	/* Only CONFIGURE and RESCAN can be done without the RAID being initialized. */
	switch (cmd) {
	case RAIDFRAME_CONFIGURE:
	case RAIDFRAME_RESCAN:
		break;
	default:
		if (!rf_inited(rs))
			return ENXIO;
	}

	switch (cmd) {
		/* configure the system */
	case RAIDFRAME_CONFIGURE:
		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
			return retcode;
		return rf_construct(rs, k_cfg);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((retcode = raidlock(rs)) != 0)
			return retcode;

		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return retcode;
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if RF_DISABLED
	case RAIDFRAME_SET_COMPONENT_LABEL:
		return rf_set_component_label(raidPtr, data);
#endif

	case RAIDFRAME_INIT_LABELS:
		return rf_init_component_label(raidPtr, data);

	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return 0;
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
		    rf_RewriteParityThread, raidPtr, "raid_parity");

	case RAIDFRAME_ADD_HOT_SPARE:
		rf_copy_single_component(&component, data);
		return rf_add_hot_spare(raidPtr, &component);

		/* Remove a non hot-spare component, never implemented in userland */
	case RAIDFRAME_DELETE_COMPONENT:
		rf_copy_single_component(&component, data);
		return rf_delete_component(raidPtr, &component);

	case RAIDFRAME_REMOVE_COMPONENT:
		rf_copy_single_component(&component, data);
		return rf_remove_component(raidPtr, &component);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		rf_copy_single_component(&component, data);
		return rf_incorporate_hot_spare(raidPtr, &component);

	case RAIDFRAME_REBUILD_IN_PLACE:
		return rf_rebuild_in_place(raidPtr, data);

	case RAIDFRAME_GET_INFO:
		ucfgp = *(RF_DeviceConfig_t **)data;
		d_cfg = RF_Malloc(sizeof(*d_cfg));
		if (d_cfg == NULL)
			return ENOMEM;
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
		return retcode;

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return 0;

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map, data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESCAN:
		return rf_rescan();

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return 0;

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return 0;

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return 0;

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return 0;

	case RAIDFRAME_FAIL_DISK:
		return rf_fail_disk(raidPtr, data);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return EINVAL;
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->copyback_thread,
		    rf_CopybackThread, raidPtr, "raid_copyback");

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		return rf_check_recon_status(raidPtr, data);

	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
			    raidPtr->parity_rewrite_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table -- XXX */
#if RF_DISABLED
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return 0;

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		waitreq = RF_Malloc(sizeof(*waitreq));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return 0;

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor. the return status of the spare
		 * table installation is passed in the "fcol" field */
		waitreq = RF_Malloc(sizeof(*waitreq));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return retcode;
#endif
	default:
		/*
		 * Don't bother trying to load compat modules
		 * if it is not our ioctl. This is more efficient
		 * and makes rump tests not depend on compat code
		 */
		if (IOCGROUP(cmd) != 'r')
			break;
#ifdef _LP64
		if ((l->l_proc->p_flag & PK_32) != 0) {
			module_autoload("compat_netbsd32_raid",
			    MODULE_CLASS_EXEC);
			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
			    (rs, cmd, data), enosys(), retcode);
			if (retcode != EPASSTHROUGH)
				return retcode;
		}
#endif
		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;

		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return EINVAL;

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return retcode;
}


/* raidinit -- complete the rest of the initialization for the
   RAIDframe device. */

static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe. */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

2011 	/* mark unit as usable */
2012 rs->sc_flags |= RAIDF_INITED;
2013
2014 dkwedge_discover(&dksc->sc_dkdev);
2015 }
2016
2017 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
2018 /* wake up the daemon & tell it to get us a spare table
2019 * XXX
2020 * the entries in the queues should be tagged with the raidPtr
2021 * so that in the extremely rare case that two recons happen at once,
2022  * we know for which device we're requesting a spare table
2023 * XXX
2024 *
2025 * XXX This code is not currently used. GO
2026 */
2027 int
2028 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
2029 {
2030 int retcode;
2031
2032 rf_lock_mutex2(rf_sparet_wait_mutex);
2033 req->next = rf_sparet_wait_queue;
2034 rf_sparet_wait_queue = req;
2035 rf_broadcast_cond2(rf_sparet_wait_cv);
2036
2037 	/* rf_wait_cond2() releases the mutex while we sleep */
2038 while (!rf_sparet_resp_queue) {
2039 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
2040 }
2041 req = rf_sparet_resp_queue;
2042 rf_sparet_resp_queue = req->next;
2043 rf_unlock_mutex2(rf_sparet_wait_mutex);
2044
2045 retcode = req->fcol;
2046 RF_Free(req, sizeof(*req)); /* this is not the same req as we
2047 * alloc'd */
2048 return retcode;
2049 }
2050 #endif
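
/*
 * Illustrative sketch, not part of this driver (and the spare-table path
 * above is currently disabled): the userland daemon side of the handshake
 * would block in RAIDFRAME_SPARET_WAIT, compute a table, and deliver it
 * with RAIDFRAME_SEND_SPARET; RAIDFRAME_ABORT_SPARET_WAIT posts an fcol
 * of -1 to make the loop exit.  compute_spare_table() and fd are
 * assumptions for illustration, not real RAIDframe symbols.
 */
#if 0
	RF_SparetWait_t waitreq;
	void *table;

	for (;;) {
		/* does not return until the kernel needs a spare table */
		if (ioctl(fd, RAIDFRAME_SPARET_WAIT, &waitreq) == -1)
			break;
		if (waitreq.fcol == -1)
			break;	/* ABORT_SPARET_WAIT told us to exit */
		table = compute_spare_table(&waitreq);	/* hypothetical */
		(void)ioctl(fd, RAIDFRAME_SEND_SPARET, &table);
	}
#endif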
2051
2052 /* a wrapper around rf_DoAccess that extracts appropriate info from the
2053  * bp & passes it down.
2054  * Any calls originating in the kernel must use non-blocking I/O.
2055  * Do some extra sanity checking to return "appropriate" error values for
2056  * certain conditions (to make some standard utilities work).
2057 *
2058 * Formerly known as: rf_DoAccessKernel
2059 */
2060 void
2061 raidstart(RF_Raid_t *raidPtr)
2062 {
2063 struct raid_softc *rs;
2064 struct dk_softc *dksc;
2065
2066 rs = raidPtr->softc;
2067 dksc = &rs->sc_dksc;
2068 /* quick check to see if anything has died recently */
2069 rf_lock_mutex2(raidPtr->mutex);
2070 if (raidPtr->numNewFailures > 0) {
2071 rf_unlock_mutex2(raidPtr->mutex);
2072 rf_update_component_labels(raidPtr,
2073 RF_NORMAL_COMPONENT_UPDATE);
2074 rf_lock_mutex2(raidPtr->mutex);
2075 raidPtr->numNewFailures--;
2076 }
2077 rf_unlock_mutex2(raidPtr->mutex);
2078
2079 if ((rs->sc_flags & RAIDF_INITED) == 0) {
2080 printf("raid%d: raidstart not ready\n", raidPtr->raidid);
2081 return;
2082 }
2083
2084 dk_start(dksc, NULL);
2085 }
2086
2087 static int
2088 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
2089 {
2090 RF_SectorCount_t num_blocks, pb, sum;
2091 RF_RaidAddr_t raid_addr;
2092 daddr_t blocknum;
2093 int rc;
2094
2095 rf_lock_mutex2(raidPtr->mutex);
2096 if (raidPtr->openings == 0) {
2097 rf_unlock_mutex2(raidPtr->mutex);
2098 return EAGAIN;
2099 }
2100 rf_unlock_mutex2(raidPtr->mutex);
2101
2102 blocknum = bp->b_rawblkno;
2103
2104 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
2105 (int) blocknum));
2106
2107 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
2108 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
2109
2110 /* *THIS* is where we adjust what block we're going to...
2111 * but DO NOT TOUCH bp->b_blkno!!! */
2112 raid_addr = blocknum;
2113
2114 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
2115 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
2116 sum = raid_addr + num_blocks + pb;
2117 if (1 || rf_debugKernelAccess) {
2118 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
2119 (int) raid_addr, (int) sum, (int) num_blocks,
2120 (int) pb, (int) bp->b_resid));
2121 }
2122 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
2123 || (sum < num_blocks) || (sum < pb)) {
2124 rc = ENOSPC;
2125 goto done;
2126 }
2127 /*
2128 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
2129 */
2130
2131 if (bp->b_bcount & raidPtr->sectorMask) {
2132 rc = ENOSPC;
2133 goto done;
2134 }
2135 db1_printf(("Calling DoAccess..\n"));
2136
2137
2138 rf_lock_mutex2(raidPtr->mutex);
2139 raidPtr->openings--;
2140 rf_unlock_mutex2(raidPtr->mutex);
2141
2142 	/* don't ever condition on bp->b_flags & B_WRITE;
2143 	 * always condition on B_READ instead */
2144
2145 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
2146 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
2147 raid_addr, num_blocks,
2148 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2149
2150 done:
2151 return rc;
2152 }
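
/*
 * A worked example of the bounds arithmetic in raiddoaccess() (values
 * assumed for illustration): with 512-byte sectors, logBytesPerSector is
 * 9 and sectorMask is 511, so a 4096-byte request at raid_addr 100 gives
 * num_blocks = 4096 >> 9 = 8, pb = 0, and sum = 108, which must not
 * exceed totalSectors.  Because the quantities are unsigned, the
 * (sum < raid_addr) style comparisons also catch arithmetic wraparound
 * from absurdly large requests.
 */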
2153
2154 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2155
2156 int
2157 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2158 {
2159 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2160 struct buf *bp;
2161
2162 req->queue = queue;
2163 bp = req->bp;
2164
2165 switch (req->type) {
2166 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
2167 /* XXX need to do something extra here.. */
2168 /* I'm leaving this in, as I've never actually seen it used,
2169 * and I'd like folks to report it... GO */
2170 printf("%s: WAKEUP CALLED\n", __func__);
2171 queue->numOutstanding++;
2172
2173 bp->b_flags = 0;
2174 bp->b_private = req;
2175
2176 KernelWakeupFunc(bp);
2177 break;
2178
2179 case RF_IO_TYPE_READ:
2180 case RF_IO_TYPE_WRITE:
2181 #if RF_ACC_TRACE > 0
2182 if (req->tracerec) {
2183 RF_ETIMER_START(req->tracerec->timer);
2184 }
2185 #endif
2186 InitBP(bp, queue->rf_cinfo->ci_vp,
2187 op, queue->rf_cinfo->ci_dev,
2188 req->sectorOffset, req->numSector,
2189 req->buf, KernelWakeupFunc, (void *) req,
2190 queue->raidPtr->logBytesPerSector);
2191
2192 if (rf_debugKernelAccess) {
2193 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2194 (long) bp->b_blkno));
2195 }
2196 queue->numOutstanding++;
2197 queue->last_deq_sector = req->sectorOffset;
2198 /* acc wouldn't have been let in if there were any pending
2199 * reqs at any other priority */
2200 queue->curPriority = req->priority;
2201
2202 db1_printf(("Going for %c to unit %d col %d\n",
2203 req->type, queue->raidPtr->raidid,
2204 queue->col));
2205 db1_printf(("sector %d count %d (%d bytes) %d\n",
2206 (int) req->sectorOffset, (int) req->numSector,
2207 (int) (req->numSector <<
2208 queue->raidPtr->logBytesPerSector),
2209 (int) queue->raidPtr->logBytesPerSector));
2210
2211 /*
2212 * XXX: drop lock here since this can block at
2213 * least with backing SCSI devices. Retake it
2214 * to minimize fuss with calling interfaces.
2215 */
2216
2217 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2218 bdev_strategy(bp);
2219 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2220 break;
2221
2222 default:
2223 panic("bad req->type in rf_DispatchKernelIO");
2224 }
2225 db1_printf(("Exiting from DispatchKernelIO\n"));
2226
2227 return 0;
2228 }
2229 /* this is the callback function associated with an I/O invoked from
2230    kernel code.
2231  */
2232 static void
2233 KernelWakeupFunc(struct buf *bp)
2234 {
2235 RF_DiskQueueData_t *req = NULL;
2236 RF_DiskQueue_t *queue;
2237
2238 db1_printf(("recovering the request queue:\n"));
2239
2240 req = bp->b_private;
2241
2242 queue = (RF_DiskQueue_t *) req->queue;
2243
2244 rf_lock_mutex2(queue->raidPtr->iodone_lock);
2245
2246 #if RF_ACC_TRACE > 0
2247 if (req->tracerec) {
2248 RF_ETIMER_STOP(req->tracerec->timer);
2249 RF_ETIMER_EVAL(req->tracerec->timer);
2250 rf_lock_mutex2(rf_tracing_mutex);
2251 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2252 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2253 req->tracerec->num_phys_ios++;
2254 rf_unlock_mutex2(rf_tracing_mutex);
2255 }
2256 #endif
2257
2258 /* XXX Ok, let's get aggressive... If b_error is set, let's go
2259 * ballistic, and mark the component as hosed... */
2260
2261 if (bp->b_error != 0) {
2262 /* Mark the disk as dead */
2263 /* but only mark it once... */
2264 /* and only if it wouldn't leave this RAID set
2265 completely broken */
2266 if (((queue->raidPtr->Disks[queue->col].status ==
2267 rf_ds_optimal) ||
2268 (queue->raidPtr->Disks[queue->col].status ==
2269 rf_ds_used_spare)) &&
2270 (queue->raidPtr->numFailures <
2271 queue->raidPtr->Layout.map->faultsTolerated)) {
2272 printf("raid%d: IO Error (%d). Marking %s as failed.\n",
2273 queue->raidPtr->raidid,
2274 bp->b_error,
2275 queue->raidPtr->Disks[queue->col].devname);
2276 queue->raidPtr->Disks[queue->col].status =
2277 rf_ds_failed;
2278 queue->raidPtr->status = rf_rs_degraded;
2279 queue->raidPtr->numFailures++;
2280 queue->raidPtr->numNewFailures++;
2281 } else { /* Disk is already dead... */
2282 /* printf("Disk already marked as dead!\n"); */
2283 }
2284
2285 }
2286
2287 /* Fill in the error value */
2288 req->error = bp->b_error;
2289
2290 /* Drop this one on the "finished" queue... */
2291 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2292
2293 /* Let the raidio thread know there is work to be done. */
2294 rf_signal_cond2(queue->raidPtr->iodone_cv);
2295
2296 rf_unlock_mutex2(queue->raidPtr->iodone_lock);
2297 }
2298
2299
2300 /*
2301 * initialize a buf structure for doing an I/O in the kernel.
2302 */
2303 static void
2304 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2305 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2306 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
2307 {
2308 bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
2309 bp->b_oflags = 0;
2310 bp->b_cflags = 0;
2311 bp->b_bcount = numSect << logBytesPerSector;
2312 bp->b_bufsize = bp->b_bcount;
2313 bp->b_error = 0;
2314 bp->b_dev = dev;
2315 bp->b_data = bf;
2316 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2317 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2318 if (bp->b_bcount == 0) {
2319 panic("bp->b_bcount is zero in InitBP!!");
2320 }
2321 bp->b_iodone = cbFunc;
2322 bp->b_private = cbArg;
2323 }
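
/*
 * Worked example of the b_blkno conversion in InitBP() (values assumed
 * for illustration): b_blkno is expressed in DEV_BSIZE (512-byte) units,
 * so for a component with 4096-byte sectors (logBytesPerSector == 12)
 * and DEV_BSHIFT == 9, startSect 10 becomes (10 << 12) >> 9 == 80
 * 512-byte blocks.
 */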
2324
2325 /*
2326 * Wait interruptibly for an exclusive lock.
2327 *
2328 * XXX
2329 * Several drivers do this; it should be abstracted and made MP-safe.
2330 * (Hmm... where have we seen this warning before :-> GO )
2331 */
2332 static int
2333 raidlock(struct raid_softc *rs)
2334 {
2335 int error;
2336
2337 error = 0;
2338 mutex_enter(&rs->sc_mutex);
2339 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2340 rs->sc_flags |= RAIDF_WANTED;
2341 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2342 if (error != 0)
2343 goto done;
2344 }
2345 rs->sc_flags |= RAIDF_LOCKED;
2346 done:
2347 mutex_exit(&rs->sc_mutex);
2348 return error;
2349 }
2350 /*
2351 * Unlock and wake up any waiters.
2352 */
2353 static void
2354 raidunlock(struct raid_softc *rs)
2355 {
2356
2357 mutex_enter(&rs->sc_mutex);
2358 rs->sc_flags &= ~RAIDF_LOCKED;
2359 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2360 rs->sc_flags &= ~RAIDF_WANTED;
2361 cv_broadcast(&rs->sc_cv);
2362 }
2363 mutex_exit(&rs->sc_mutex);
2364 }
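
/*
 * Sketch of the canonical way to use the pair above; this mirrors what
 * raid_detach() does later in this file.  do_work_locked() is a
 * placeholder, not a real function:
 */
#if 0
	if ((error = raidlock(rs)) != 0)
		return error;
	error = do_work_locked(rs);	/* hypothetical locked operation */
	raidunlock(rs);
	return error;
#endif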
2365
2366
2367 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2368 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2369 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2370
2371 static daddr_t
2372 rf_component_info_offset(void)
2373 {
2374
2375 return RF_COMPONENT_INFO_OFFSET;
2376 }
2377
2378 static daddr_t
2379 rf_component_info_size(unsigned secsize)
2380 {
2381 daddr_t info_size;
2382
2383 KASSERT(secsize);
2384 if (secsize > RF_COMPONENT_INFO_SIZE)
2385 info_size = secsize;
2386 else
2387 info_size = RF_COMPONENT_INFO_SIZE;
2388
2389 return info_size;
2390 }
2391
2392 static daddr_t
2393 rf_parity_map_offset(RF_Raid_t *raidPtr)
2394 {
2395 daddr_t map_offset;
2396
2397 KASSERT(raidPtr->bytesPerSector);
2398 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2399 map_offset = raidPtr->bytesPerSector;
2400 else
2401 map_offset = RF_COMPONENT_INFO_SIZE;
2402 map_offset += rf_component_info_offset();
2403
2404 return map_offset;
2405 }
2406
2407 static daddr_t
2408 rf_parity_map_size(RF_Raid_t *raidPtr)
2409 {
2410 daddr_t map_size;
2411
2412 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2413 map_size = raidPtr->bytesPerSector;
2414 else
2415 map_size = RF_PARITY_MAP_SIZE;
2416
2417 return map_size;
2418 }
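
/*
 * Taken together, the four helpers above lay out the reserved region at
 * the front of each component like this (illustrative arithmetic):
 *
 *	component info: offset 16384, size max(secsize, 1024)
 *	parity map:     offset 16384 + max(secsize, 1024),
 *	                size max(secsize, RF_PARITYMAP_NBYTE)
 *
 * For an assumed 4096-byte-sector disk the sector size wins both max()
 * comparisons, so the parity map starts at 16384 + 4096 = 20480.
 */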
2419
2420 int
2421 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2422 {
2423 RF_ComponentLabel_t *clabel;
2424
2425 clabel = raidget_component_label(raidPtr, col);
2426 clabel->clean = RF_RAID_CLEAN;
2427 raidflush_component_label(raidPtr, col);
2428 return(0);
2429 }
2430
2431
2432 int
2433 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2434 {
2435 RF_ComponentLabel_t *clabel;
2436
2437 clabel = raidget_component_label(raidPtr, col);
2438 clabel->clean = RF_RAID_DIRTY;
2439 raidflush_component_label(raidPtr, col);
2440 return(0);
2441 }
2442
2443 int
2444 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2445 {
2446 KASSERT(raidPtr->bytesPerSector);
2447
2448 return raidread_component_label(raidPtr->bytesPerSector,
2449 raidPtr->Disks[col].dev,
2450 raidPtr->raid_cinfo[col].ci_vp,
2451 &raidPtr->raid_cinfo[col].ci_label);
2452 }
2453
2454 RF_ComponentLabel_t *
2455 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2456 {
2457 return &raidPtr->raid_cinfo[col].ci_label;
2458 }
2459
2460 int
2461 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2462 {
2463 RF_ComponentLabel_t *label;
2464
2465 label = &raidPtr->raid_cinfo[col].ci_label;
2466 label->mod_counter = raidPtr->mod_counter;
2467 #ifndef RF_NO_PARITY_MAP
2468 label->parity_map_modcount = label->mod_counter;
2469 #endif
2470 return raidwrite_component_label(raidPtr->bytesPerSector,
2471 raidPtr->Disks[col].dev,
2472 raidPtr->raid_cinfo[col].ci_vp, label);
2473 }
2474
2475 /*
2476  * Swap the label endianness.
2477  *
2478  * Everything in the component label is 4-byte-swapped except the version,
2479  * which is kept byte-swapped at all times and tells the writer that a
2480  * swap is necessary.
2481  *
2482  * For reads it is expected that out_label == clabel, but writes expect
2483  * separate labels so that only the re-swapped label is written out to
2484  * disk, leaving the swapped-except-for-version label in memory.
2485  *
2486  * Only swapping of label version 2 is supported.
2487  */
2488 static void
2489 rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
2490 {
2491 int *in, *out, *in_last;
2492
2493 KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));
2494
2495 /* Don't swap the label, but do copy it. */
2496 out_label->version = clabel->version;
2497
2498 in = &clabel->serial_number;
2499 in_last = &clabel->future_use2[42];
2500 out = &out_label->serial_number;
2501
2502 for (; in < in_last; in++, out++)
2503 *out = bswap32(*in);
2504 }
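
/*
 * Example of the versioning trick above (assumed scenario): a label
 * written on an opposite-endian machine is read back with version ==
 * bswap32(RF_COMPONENT_LABEL_VERSION).  rf_swap_label() then fixes up
 * every other 32-bit field but leaves version byte-swapped, so a later
 * raidwrite_component_label() knows to swap the fields back before the
 * label goes out to disk.
 */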
2505
2506 static int
2507 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2508 RF_ComponentLabel_t *clabel)
2509 {
2510 int error;
2511
2512 error = raidread_component_area(dev, b_vp, clabel,
2513 sizeof(RF_ComponentLabel_t),
2514 rf_component_info_offset(),
2515 rf_component_info_size(secsize));
2516
2517 if (error == 0 &&
2518 clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2519 rf_swap_label(clabel, clabel);
2520 }
2521
2522 return error;
2523 }
2524
2525 /* ARGSUSED */
2526 static int
2527 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2528 size_t msize, daddr_t offset, daddr_t dsize)
2529 {
2530 struct buf *bp;
2531 int error;
2532
2533 /* XXX should probably ensure that we don't try to do this if
2534 someone has changed rf_protected_sectors. */
2535
2536 if (b_vp == NULL) {
2537 /* For whatever reason, this component is not valid.
2538 Don't try to read a component label from it. */
2539 return(EINVAL);
2540 }
2541
2542 /* get a block of the appropriate size... */
2543 bp = geteblk((int)dsize);
2544 bp->b_dev = dev;
2545
2546 /* get our ducks in a row for the read */
2547 bp->b_blkno = offset / DEV_BSIZE;
2548 bp->b_bcount = dsize;
2549 bp->b_flags |= B_READ;
2550 bp->b_resid = dsize;
2551
2552 bdev_strategy(bp);
2553 error = biowait(bp);
2554
2555 if (!error) {
2556 memcpy(data, bp->b_data, msize);
2557 }
2558
2559 brelse(bp, 0);
2560 return(error);
2561 }
2562
2563 static int
2564 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2565 RF_ComponentLabel_t *clabel)
2566 {
2567 RF_ComponentLabel_t *clabel_write = clabel;
2568 RF_ComponentLabel_t lclabel;
2569 int error;
2570
2571 if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2572 clabel_write = &lclabel;
2573 rf_swap_label(clabel, clabel_write);
2574 }
2575 error = raidwrite_component_area(dev, b_vp, clabel_write,
2576 sizeof(RF_ComponentLabel_t),
2577 rf_component_info_offset(),
2578 rf_component_info_size(secsize), 0);
2579
2580 return error;
2581 }
2582
2583 /* ARGSUSED */
2584 static int
2585 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2586 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2587 {
2588 struct buf *bp;
2589 int error;
2590
2591 /* get a block of the appropriate size... */
2592 bp = geteblk((int)dsize);
2593 bp->b_dev = dev;
2594
2595 /* get our ducks in a row for the write */
2596 bp->b_blkno = offset / DEV_BSIZE;
2597 bp->b_bcount = dsize;
2598 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2599 bp->b_resid = dsize;
2600
2601 memset(bp->b_data, 0, dsize);
2602 memcpy(bp->b_data, data, msize);
2603
2604 bdev_strategy(bp);
2605 if (asyncp)
2606 return 0;
2607 error = biowait(bp);
2608 brelse(bp, 0);
2609 if (error) {
2610 #if 1
2611 printf("Failed to write RAID component info!\n");
2612 #endif
2613 }
2614
2615 return(error);
2616 }
2617
2618 void
2619 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2620 {
2621 int c;
2622
2623 for (c = 0; c < raidPtr->numCol; c++) {
2624 /* Skip dead disks. */
2625 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2626 continue;
2627 /* XXXjld: what if an error occurs here? */
2628 raidwrite_component_area(raidPtr->Disks[c].dev,
2629 raidPtr->raid_cinfo[c].ci_vp, map,
2630 RF_PARITYMAP_NBYTE,
2631 rf_parity_map_offset(raidPtr),
2632 rf_parity_map_size(raidPtr), 0);
2633 }
2634 }
2635
2636 void
2637 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2638 {
2639 struct rf_paritymap_ondisk tmp;
2640 int c,first;
2641
2642 first=1;
2643 for (c = 0; c < raidPtr->numCol; c++) {
2644 /* Skip dead disks. */
2645 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2646 continue;
2647 raidread_component_area(raidPtr->Disks[c].dev,
2648 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2649 RF_PARITYMAP_NBYTE,
2650 rf_parity_map_offset(raidPtr),
2651 rf_parity_map_size(raidPtr));
2652 if (first) {
2653 memcpy(map, &tmp, sizeof(*map));
2654 first = 0;
2655 } else {
2656 rf_paritymap_merge(map, &tmp);
2657 }
2658 }
2659 }
2660
2661 void
2662 rf_markalldirty(RF_Raid_t *raidPtr)
2663 {
2664 RF_ComponentLabel_t *clabel;
2665 int sparecol;
2666 int c;
2667 int j;
2668 int scol = -1;
2669
2670 raidPtr->mod_counter++;
2671 for (c = 0; c < raidPtr->numCol; c++) {
2672 /* we don't want to touch (at all) a disk that has
2673 failed */
2674 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2675 clabel = raidget_component_label(raidPtr, c);
2676 if (clabel->status == rf_ds_spared) {
2677 /* XXX do something special...
2678 but whatever you do, don't
2679 try to access it!! */
2680 } else {
2681 raidmarkdirty(raidPtr, c);
2682 }
2683 }
2684 }
2685
2686 for (c = 0; c < raidPtr->numSpare ; c++) {
2687 sparecol = raidPtr->numCol + c;
2688
2689 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2690 			/*
2691 			 *
2692 			 * We claim this disk is "optimal" if it's
2693 			 * rf_ds_used_spare, as that means it should be
2694 			 * directly substitutable for the disk it replaced.
2695 			 * We note that too...
2696 			 *
2697 			 */
2698
2699 for(j=0;j<raidPtr->numCol;j++) {
2700 if (raidPtr->Disks[j].spareCol == sparecol) {
2701 scol = j;
2702 break;
2703 }
2704 }
2705
2706 clabel = raidget_component_label(raidPtr, sparecol);
2707 /* make sure status is noted */
2708
2709 raid_init_component_label(raidPtr, clabel);
2710
2711 clabel->row = 0;
2712 clabel->column = scol;
2713 /* Note: we *don't* change status from rf_ds_used_spare
2714 to rf_ds_optimal */
2715 /* clabel.status = rf_ds_optimal; */
2716
2717 raidmarkdirty(raidPtr, sparecol);
2718 }
2719 }
2720 }
2721
2722
2723 void
2724 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2725 {
2726 RF_ComponentLabel_t *clabel;
2727 int sparecol;
2728 int c;
2729 int j;
2730 int scol;
2731 struct raid_softc *rs = raidPtr->softc;
2732
2733 scol = -1;
2734
2735 /* XXX should do extra checks to make sure things really are clean,
2736 rather than blindly setting the clean bit... */
2737
2738 raidPtr->mod_counter++;
2739
2740 for (c = 0; c < raidPtr->numCol; c++) {
2741 if (raidPtr->Disks[c].status == rf_ds_optimal) {
2742 clabel = raidget_component_label(raidPtr, c);
2743 /* make sure status is noted */
2744 clabel->status = rf_ds_optimal;
2745
2746 /* note what unit we are configured as */
2747 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2748 clabel->last_unit = raidPtr->raidid;
2749
2750 raidflush_component_label(raidPtr, c);
2751 if (final == RF_FINAL_COMPONENT_UPDATE) {
2752 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2753 raidmarkclean(raidPtr, c);
2754 }
2755 }
2756 }
2757 /* else we don't touch it.. */
2758 }
2759
2760 for (c = 0; c < raidPtr->numSpare ; c++) {
2761 sparecol = raidPtr->numCol + c;
2762
2763 /* Need to ensure that the reconstruct actually completed! */
2764 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2765 			/*
2766 			 *
2767 			 * We claim this disk is "optimal" if it's
2768 			 * rf_ds_used_spare, as that means it should be
2769 			 * directly substitutable for the disk it replaced.
2770 			 * We note that too...
2771 			 *
2772 			 */
2773
2774 for(j=0;j<raidPtr->numCol;j++) {
2775 if (raidPtr->Disks[j].spareCol == sparecol) {
2776 scol = j;
2777 break;
2778 }
2779 }
2780
2781 /* XXX shouldn't *really* need this... */
2782 clabel = raidget_component_label(raidPtr, sparecol);
2783 /* make sure status is noted */
2784
2785 raid_init_component_label(raidPtr, clabel);
2786
2787 clabel->column = scol;
2788 clabel->status = rf_ds_optimal;
2789 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2790 clabel->last_unit = raidPtr->raidid;
2791
2792 raidflush_component_label(raidPtr, sparecol);
2793 if (final == RF_FINAL_COMPONENT_UPDATE) {
2794 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2795 raidmarkclean(raidPtr, sparecol);
2796 }
2797 }
2798 }
2799 }
2800 }
2801
2802 void
2803 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2804 {
2805
2806 if (vp != NULL) {
2807 if (auto_configured == 1) {
2808 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2809 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2810 vput(vp);
2811
2812 } else {
2813 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2814 }
2815 }
2816 }
2817
2818
2819 void
2820 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2821 {
2822 int r,c;
2823 struct vnode *vp;
2824 int acd;
2825
2826
2827 /* We take this opportunity to close the vnodes like we should.. */
2828
2829 for (c = 0; c < raidPtr->numCol; c++) {
2830 vp = raidPtr->raid_cinfo[c].ci_vp;
2831 acd = raidPtr->Disks[c].auto_configured;
2832 rf_close_component(raidPtr, vp, acd);
2833 raidPtr->raid_cinfo[c].ci_vp = NULL;
2834 raidPtr->Disks[c].auto_configured = 0;
2835 }
2836
2837 for (r = 0; r < raidPtr->numSpare; r++) {
2838 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2839 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2840 rf_close_component(raidPtr, vp, acd);
2841 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2842 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2843 }
2844 }
2845
2846
2847 static void
2848 rf_ReconThread(struct rf_recon_req_internal *req)
2849 {
2850 int s;
2851 RF_Raid_t *raidPtr;
2852
2853 s = splbio();
2854 raidPtr = (RF_Raid_t *) req->raidPtr;
2855 raidPtr->recon_in_progress = 1;
2856
2857 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2858 raidPtr->forceRecon = 1;
2859 }
2860
2861 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2862 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2863
2864 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2865 raidPtr->forceRecon = 0;
2866 }
2867
2868 RF_Free(req, sizeof(*req));
2869
2870 raidPtr->recon_in_progress = 0;
2871 splx(s);
2872
2873 /* That's all... */
2874 kthread_exit(0); /* does not return */
2875 }
2876
2877 static void
2878 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2879 {
2880 int retcode;
2881 int s;
2882
2883 raidPtr->parity_rewrite_stripes_done = 0;
2884 raidPtr->parity_rewrite_in_progress = 1;
2885 s = splbio();
2886 retcode = rf_RewriteParity(raidPtr);
2887 splx(s);
2888 if (retcode) {
2889 printf("raid%d: Error re-writing parity (%d)!\n",
2890 raidPtr->raidid, retcode);
2891 } else {
2892 /* set the clean bit! If we shutdown correctly,
2893 the clean bit on each component label will get
2894 set */
2895 raidPtr->parity_good = RF_RAID_CLEAN;
2896 }
2897 raidPtr->parity_rewrite_in_progress = 0;
2898
2899 /* Anyone waiting for us to stop? If so, inform them... */
2900 if (raidPtr->waitShutdown) {
2901 rf_lock_mutex2(raidPtr->rad_lock);
2902 cv_broadcast(&raidPtr->parity_rewrite_cv);
2903 rf_unlock_mutex2(raidPtr->rad_lock);
2904 }
2905
2906 /* That's all... */
2907 kthread_exit(0); /* does not return */
2908 }
2909
2910
2911 static void
2912 rf_CopybackThread(RF_Raid_t *raidPtr)
2913 {
2914 int s;
2915
2916 raidPtr->copyback_in_progress = 1;
2917 s = splbio();
2918 rf_CopybackReconstructedData(raidPtr);
2919 splx(s);
2920 raidPtr->copyback_in_progress = 0;
2921
2922 /* That's all... */
2923 kthread_exit(0); /* does not return */
2924 }
2925
2926
2927 static void
2928 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
2929 {
2930 int s;
2931 RF_Raid_t *raidPtr;
2932
2933 s = splbio();
2934 raidPtr = req->raidPtr;
2935 raidPtr->recon_in_progress = 1;
2936
2937 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2938 raidPtr->forceRecon = 1;
2939 }
2940
2941 rf_ReconstructInPlace(raidPtr, req->col);
2942
2943 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2944 raidPtr->forceRecon = 0;
2945 }
2946
2947 RF_Free(req, sizeof(*req));
2948 raidPtr->recon_in_progress = 0;
2949 splx(s);
2950
2951 /* That's all... */
2952 kthread_exit(0); /* does not return */
2953 }
2954
2955 static RF_AutoConfig_t *
2956 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2957 const char *cname, RF_SectorCount_t size, uint64_t numsecs,
2958 unsigned secsize)
2959 {
2960 int good_one = 0;
2961 RF_ComponentLabel_t *clabel;
2962 RF_AutoConfig_t *ac;
2963
2964 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);
2965
2966 if (!raidread_component_label(secsize, dev, vp, clabel)) {
2967 /* Got the label. Does it look reasonable? */
2968 if (rf_reasonable_label(clabel, numsecs) &&
2969 (rf_component_label_partitionsize(clabel) <= size)) {
2970 #ifdef DEBUG
2971 printf("Component on: %s: %llu\n",
2972 cname, (unsigned long long)size);
2973 rf_print_component_label(clabel);
2974 #endif
2975 /* if it's reasonable, add it, else ignore it. */
2976 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2977 M_WAITOK);
2978 strlcpy(ac->devname, cname, sizeof(ac->devname));
2979 ac->dev = dev;
2980 ac->vp = vp;
2981 ac->clabel = clabel;
2982 ac->next = ac_list;
2983 ac_list = ac;
2984 good_one = 1;
2985 }
2986 }
2987 if (!good_one) {
2988 /* cleanup */
2989 free(clabel, M_RAIDFRAME);
2990 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2991 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2992 vput(vp);
2993 }
2994 return ac_list;
2995 }
2996
2997 static RF_AutoConfig_t *
2998 rf_find_raid_components(void)
2999 {
3000 struct vnode *vp;
3001 struct disklabel label;
3002 device_t dv;
3003 deviter_t di;
3004 dev_t dev;
3005 int bmajor, bminor, wedge, rf_part_found;
3006 int error;
3007 int i;
3008 RF_AutoConfig_t *ac_list;
3009 uint64_t numsecs;
3010 unsigned secsize;
3011 int dowedges;
3012
3013 /* initialize the AutoConfig list */
3014 ac_list = NULL;
3015
3016 	/*
3017 	 * We begin by trolling through *all* the devices on the system *twice*:
3018 	 * first we scan for wedges, then for other devices. This avoids
3019 	 * using a raw partition instead of a wedge that covers the whole disk.
3020 	 */
3021
3022 for (dowedges=1; dowedges>=0; --dowedges) {
3023 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
3024 dv = deviter_next(&di)) {
3025
3026 /* we are only interested in disks */
3027 if (device_class(dv) != DV_DISK)
3028 continue;
3029
3030 /* we don't care about floppies */
3031 if (device_is_a(dv, "fd")) {
3032 continue;
3033 }
3034
3035 /* we don't care about CDs. */
3036 if (device_is_a(dv, "cd")) {
3037 continue;
3038 }
3039
3040 /* we don't care about md. */
3041 if (device_is_a(dv, "md")) {
3042 continue;
3043 }
3044
3045 /* hdfd is the Atari/Hades floppy driver */
3046 if (device_is_a(dv, "hdfd")) {
3047 continue;
3048 }
3049
3050 /* fdisa is the Atari/Milan floppy driver */
3051 if (device_is_a(dv, "fdisa")) {
3052 continue;
3053 }
3054
3055 /* we don't care about spiflash */
3056 if (device_is_a(dv, "spiflash")) {
3057 continue;
3058 }
3059
3060 /* are we in the wedges pass ? */
3061 wedge = device_is_a(dv, "dk");
3062 if (wedge != dowedges) {
3063 continue;
3064 }
3065
3066 /* need to find the device_name_to_block_device_major stuff */
3067 bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
3068
3069 			rf_part_found = 0; /* No raid partition as yet */
3070
3071 /* get a vnode for the raw partition of this disk */
3072 bminor = minor(device_unit(dv));
3073 dev = wedge ? makedev(bmajor, bminor) :
3074 MAKEDISKDEV(bmajor, bminor, RAW_PART);
3075 if (bdevvp(dev, &vp))
3076 panic("RAID can't alloc vnode");
3077
3078 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3079 error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
3080
3081 if (error) {
3082 				/* "Who cares." Continue looking
3083 				   for something that exists */
3084 vput(vp);
3085 continue;
3086 }
3087
3088 error = getdisksize(vp, &numsecs, &secsize);
3089 if (error) {
3090 /*
3091 * Pseudo devices like vnd and cgd can be
3092 * opened but may still need some configuration.
3093 * Ignore these quietly.
3094 */
3095 if (error != ENXIO)
3096 printf("RAIDframe: can't get disk size"
3097 " for dev %s (%d)\n",
3098 device_xname(dv), error);
3099 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3100 vput(vp);
3101 continue;
3102 }
3103 if (wedge) {
3104 struct dkwedge_info dkw;
3105 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
3106 NOCRED);
3107 if (error) {
3108 printf("RAIDframe: can't get wedge info for "
3109 "dev %s (%d)\n", device_xname(dv), error);
3110 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3111 vput(vp);
3112 continue;
3113 }
3114
3115 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
3116 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3117 vput(vp);
3118 continue;
3119 }
3120
3121 VOP_UNLOCK(vp);
3122 ac_list = rf_get_component(ac_list, dev, vp,
3123 device_xname(dv), dkw.dkw_size, numsecs, secsize);
3124 				rf_part_found = 1; /* There is a raid component on this disk */
3125 continue;
3126 }
3127
3128 /* Ok, the disk exists. Go get the disklabel. */
3129 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
3130 if (error) {
3131 /*
3132 * XXX can't happen - open() would
3133 * have errored out (or faked up one)
3134 */
3135 if (error != ENOTTY)
3136 printf("RAIDframe: can't get label for dev "
3137 "%s (%d)\n", device_xname(dv), error);
3138 }
3139
3140 /* don't need this any more. We'll allocate it again
3141 a little later if we really do... */
3142 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3143 vput(vp);
3144
3145 if (error)
3146 continue;
3147
3148 			rf_part_found = 0; /* No raid partitions yet */
3149 for (i = 0; i < label.d_npartitions; i++) {
3150 char cname[sizeof(ac_list->devname)];
3151
3152 /* We only support partitions marked as RAID */
3153 if (label.d_partitions[i].p_fstype != FS_RAID)
3154 continue;
3155
3156 dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
3157 if (bdevvp(dev, &vp))
3158 panic("RAID can't alloc vnode");
3159
3160 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3161 error = VOP_OPEN(vp, FREAD, NOCRED);
3162 if (error) {
3163 /* Not quite a 'whatever'. In
3164 * this situation we know
3165 * there is a FS_RAID
3166 * partition, but we can't
3167 * open it. The most likely
3168 * reason is that the
3169 * partition is already in
3170 * use by another RAID set.
3171 * So note that we've already
3172 * found a partition on this
3173 * disk so we don't attempt
3174 * to use the raw disk later. */
3175 rf_part_found = 1;
3176 vput(vp);
3177 continue;
3178 }
3179 VOP_UNLOCK(vp);
3180 snprintf(cname, sizeof(cname), "%s%c",
3181 device_xname(dv), 'a' + i);
3182 ac_list = rf_get_component(ac_list, dev, vp, cname,
3183 label.d_partitions[i].p_size, numsecs, secsize);
3184 				rf_part_found = 1; /* There is at least one raid partition on this disk */
3185 }
3186
3187 			/*
3188 			 * If there is no raid component on this disk, either in a
3189 			 * disklabel or inside a wedge, check the raw partition as well,
3190 			 * as it is possible to configure raid components on raw disk
3191 			 * devices.
3192 			 */
3193
3194 if (!rf_part_found) {
3195 char cname[sizeof(ac_list->devname)];
3196
3197 dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
3198 if (bdevvp(dev, &vp))
3199 panic("RAID can't alloc vnode");
3200
3201 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3202
3203 error = VOP_OPEN(vp, FREAD, NOCRED);
3204 if (error) {
3205 /* Whatever... */
3206 vput(vp);
3207 continue;
3208 }
3209 VOP_UNLOCK(vp);
3210 snprintf(cname, sizeof(cname), "%s%c",
3211 device_xname(dv), 'a' + RAW_PART);
3212 ac_list = rf_get_component(ac_list, dev, vp, cname,
3213 label.d_partitions[RAW_PART].p_size, numsecs, secsize);
3214 }
3215 }
3216 deviter_release(&di);
3217 }
3218 return ac_list;
3219 }
3220
3221 int
3222 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3223 {
3224
3225 if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
3226 clabel->version==RF_COMPONENT_LABEL_VERSION ||
3227 clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
3228 (clabel->clean == RF_RAID_CLEAN ||
3229 clabel->clean == RF_RAID_DIRTY) &&
3230 clabel->row >=0 &&
3231 clabel->column >= 0 &&
3232 clabel->num_rows > 0 &&
3233 clabel->num_columns > 0 &&
3234 clabel->row < clabel->num_rows &&
3235 clabel->column < clabel->num_columns &&
3236 clabel->blockSize > 0 &&
3237 /*
3238 * numBlocksHi may contain garbage, but it is ok since
3239 * the type is unsigned. If it is really garbage,
3240 * rf_fix_old_label_size() will fix it.
3241 */
3242 rf_component_label_numblocks(clabel) > 0) {
3243 /*
3244 * label looks reasonable enough...
3245 * let's make sure it has no old garbage.
3246 */
3247 if (numsecs)
3248 rf_fix_old_label_size(clabel, numsecs);
3249 return(1);
3250 }
3251 return(0);
3252 }
3253
3254
3255 /*
3256 * For reasons yet unknown, some old component labels have garbage in
3257 * the newer numBlocksHi region, and this causes lossage. Since those
3258 * disks will also have numsecs set to less than 32 bits of sectors,
3259 * we can determine when this corruption has occurred, and fix it.
3260 *
3261 * The exact same problem, with the same unknown reason, happens to
3262 * the partitionSizeHi member as well.
3263 */
3264 static void
3265 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3266 {
3267
3268 if (numsecs < ((uint64_t)1 << 32)) {
3269 if (clabel->numBlocksHi) {
3270 printf("WARNING: total sectors < 32 bits, yet "
3271 "numBlocksHi set\n"
3272 "WARNING: resetting numBlocksHi to zero.\n");
3273 clabel->numBlocksHi = 0;
3274 }
3275
3276 if (clabel->partitionSizeHi) {
3277 printf("WARNING: total sectors < 32 bits, yet "
3278 "partitionSizeHi set\n"
3279 "WARNING: resetting partitionSizeHi to zero.\n");
3280 clabel->partitionSizeHi = 0;
3281 }
3282 }
3283 }
3284
3285
3286 #ifdef DEBUG
3287 void
3288 rf_print_component_label(RF_ComponentLabel_t *clabel)
3289 {
3290 uint64_t numBlocks;
3291 static const char *rp[] = {
3292 "No", "Force", "Soft", "*invalid*"
3293 };
3294
3295
3296 numBlocks = rf_component_label_numblocks(clabel);
3297
3298 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
3299 clabel->row, clabel->column,
3300 clabel->num_rows, clabel->num_columns);
3301 printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
3302 clabel->version, clabel->serial_number,
3303 clabel->mod_counter);
3304 printf(" Clean: %s Status: %d\n",
3305 clabel->clean ? "Yes" : "No", clabel->status);
3306 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
3307 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
3308 printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n",
3309 (char) clabel->parityConfig, clabel->blockSize, numBlocks);
3310 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
3311 printf(" Root partition: %s\n", rp[clabel->root_partition & 3]);
3312 printf(" Last configured as: raid%d\n", clabel->last_unit);
3313 #if 0
3314 printf(" Config order: %d\n", clabel->config_order);
3315 #endif
3316
3317 }
3318 #endif
3319
3320 static RF_ConfigSet_t *
3321 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3322 {
3323 RF_AutoConfig_t *ac;
3324 RF_ConfigSet_t *config_sets;
3325 RF_ConfigSet_t *cset;
3326 RF_AutoConfig_t *ac_next;
3327
3328
3329 config_sets = NULL;
3330
3331 /* Go through the AutoConfig list, and figure out which components
3332 belong to what sets. */
3333 ac = ac_list;
3334 while(ac!=NULL) {
3335 /* we're going to putz with ac->next, so save it here
3336 for use at the end of the loop */
3337 ac_next = ac->next;
3338
3339 if (config_sets == NULL) {
3340 /* will need at least this one... */
3341 config_sets = malloc(sizeof(RF_ConfigSet_t),
3342 M_RAIDFRAME, M_WAITOK);
3343 /* this one is easy :) */
3344 config_sets->ac = ac;
3345 config_sets->next = NULL;
3346 config_sets->rootable = 0;
3347 ac->next = NULL;
3348 } else {
3349 /* which set does this component fit into? */
3350 cset = config_sets;
3351 while(cset!=NULL) {
3352 if (rf_does_it_fit(cset, ac)) {
3353 /* looks like it matches... */
3354 ac->next = cset->ac;
3355 cset->ac = ac;
3356 break;
3357 }
3358 cset = cset->next;
3359 }
3360 if (cset==NULL) {
3361 /* didn't find a match above... new set..*/
3362 cset = malloc(sizeof(RF_ConfigSet_t),
3363 M_RAIDFRAME, M_WAITOK);
3364 cset->ac = ac;
3365 ac->next = NULL;
3366 cset->next = config_sets;
3367 cset->rootable = 0;
3368 config_sets = cset;
3369 }
3370 }
3371 ac = ac_next;
3372 }
3373
3374
3375 return(config_sets);
3376 }
3377
3378 static int
3379 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3380 {
3381 RF_ComponentLabel_t *clabel1, *clabel2;
3382
3383 /* If this one matches the *first* one in the set, that's good
3384 enough, since the other members of the set would have been
3385 through here too... */
3386 /* note that we are not checking partitionSize here..
3387
3388 Note that we are also not checking the mod_counters here.
3389 If everything else matches except the mod_counter, that's
3390 good enough for this test. We will deal with the mod_counters
3391 a little later in the autoconfiguration process.
3392
3393 (clabel1->mod_counter == clabel2->mod_counter) &&
3394
3395 The reason we don't check for this is that failed disks
3396 will have lower modification counts. If those disks are
3397 not added to the set they used to belong to, then they will
3398 form their own set, which may result in 2 different sets,
3399 for example, competing to be configured at raid0, and
3400 perhaps competing to be the root filesystem set. If the
3401 wrong ones get configured, or both attempt to become /,
3402    weird behaviour and/or serious lossage will occur. Thus we
3403 need to bring them into the fold here, and kick them out at
3404 a later point.
3405
3406 */
3407
3408 clabel1 = cset->ac->clabel;
3409 clabel2 = ac->clabel;
3410 if ((clabel1->version == clabel2->version) &&
3411 (clabel1->serial_number == clabel2->serial_number) &&
3412 (clabel1->num_rows == clabel2->num_rows) &&
3413 (clabel1->num_columns == clabel2->num_columns) &&
3414 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3415 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3416 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3417 (clabel1->parityConfig == clabel2->parityConfig) &&
3418 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3419 (clabel1->blockSize == clabel2->blockSize) &&
3420 rf_component_label_numblocks(clabel1) ==
3421 rf_component_label_numblocks(clabel2) &&
3422 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3423 (clabel1->root_partition == clabel2->root_partition) &&
3424 (clabel1->last_unit == clabel2->last_unit) &&
3425 (clabel1->config_order == clabel2->config_order)) {
3426 		/* if it gets here, it almost *has* to be a match */
3427 } else {
3428 /* it's not consistent with somebody in the set..
3429 punt */
3430 return(0);
3431 }
3432 /* all was fine.. it must fit... */
3433 return(1);
3434 }
3435
3436 static int
3437 rf_have_enough_components(RF_ConfigSet_t *cset)
3438 {
3439 RF_AutoConfig_t *ac;
3440 RF_AutoConfig_t *auto_config;
3441 RF_ComponentLabel_t *clabel;
3442 int c;
3443 int num_cols;
3444 int num_missing;
3445 int mod_counter;
3446 int mod_counter_found;
3447 int even_pair_failed;
3448 char parity_type;
3449
3450
3451 /* check to see that we have enough 'live' components
3452 of this set. If so, we can configure it if necessary */
3453
3454 num_cols = cset->ac->clabel->num_columns;
3455 parity_type = cset->ac->clabel->parityConfig;
3456
3457 /* XXX Check for duplicate components!?!?!? */
3458
3459 /* Determine what the mod_counter is supposed to be for this set. */
3460
3461 mod_counter_found = 0;
3462 mod_counter = 0;
3463 ac = cset->ac;
3464 while(ac!=NULL) {
3465 if (mod_counter_found==0) {
3466 mod_counter = ac->clabel->mod_counter;
3467 mod_counter_found = 1;
3468 } else {
3469 if (ac->clabel->mod_counter > mod_counter) {
3470 mod_counter = ac->clabel->mod_counter;
3471 }
3472 }
3473 ac = ac->next;
3474 }
3475
3476 num_missing = 0;
3477 auto_config = cset->ac;
3478
3479 even_pair_failed = 0;
3480 for(c=0; c<num_cols; c++) {
3481 ac = auto_config;
3482 while(ac!=NULL) {
3483 if ((ac->clabel->column == c) &&
3484 (ac->clabel->mod_counter == mod_counter)) {
3485 /* it's this one... */
3486 #ifdef DEBUG
3487 printf("Found: %s at %d\n",
3488 ac->devname,c);
3489 #endif
3490 break;
3491 }
3492 ac=ac->next;
3493 }
3494 if (ac==NULL) {
3495 /* Didn't find one here! */
3496 /* special case for RAID 1, especially
3497 where there are more than 2
3498 components (where RAIDframe treats
3499 things a little differently :( ) */
3500 if (parity_type == '1') {
3501 if (c%2 == 0) { /* even component */
3502 even_pair_failed = 1;
3503 } else { /* odd component. If
3504 we're failed, and
3505 so is the even
3506 component, it's
3507 "Good Night, Charlie" */
3508 if (even_pair_failed == 1) {
3509 return(0);
3510 }
3511 }
3512 } else {
3513 /* normal accounting */
3514 num_missing++;
3515 }
3516 }
3517 if ((parity_type == '1') && (c%2 == 1)) {
3518 /* Just did an even component, and we didn't
3519 bail.. reset the even_pair_failed flag,
3520 and go on to the next component.... */
3521 even_pair_failed = 0;
3522 }
3523 }
3524
3525 clabel = cset->ac->clabel;
3526
3527 if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3528 ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3529 ((clabel->parityConfig == '5') && (num_missing > 1))) {
3530 /* XXX this needs to be made *much* more general */
3531 /* Too many failures */
3532 return(0);
3533 }
3534 /* otherwise, all is well, and we've got enough to take a kick
3535 at autoconfiguring this set */
3536 return(1);
3537 }
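
/*
 * Example of the RAID 1 accounting above (assumed 4-component set):
 * components 0/1 and 2/3 form mirror pairs.  Losing components 0 and 2
 * (one from each pair) is survivable, but losing both 0 and 1 is fatal:
 * the even member sets even_pair_failed, and when its odd partner is
 * also missing we return 0 before the flag is reset for the next pair.
 */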
3538
3539 static void
3540 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3541 RF_Raid_t *raidPtr)
3542 {
3543 RF_ComponentLabel_t *clabel;
3544 int i;
3545
3546 clabel = ac->clabel;
3547
3548 /* 1. Fill in the common stuff */
3549 config->numCol = clabel->num_columns;
3550 config->numSpare = 0; /* XXX should this be set here? */
3551 config->sectPerSU = clabel->sectPerSU;
3552 config->SUsPerPU = clabel->SUsPerPU;
3553 config->SUsPerRU = clabel->SUsPerRU;
3554 config->parityConfig = clabel->parityConfig;
3555 /* XXX... */
3556 strcpy(config->diskQueueType,"fifo");
3557 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3558 config->layoutSpecificSize = 0; /* XXX ?? */
3559
3560 while(ac!=NULL) {
3561 /* row/col values will be in range due to the checks
3562 		   in rf_reasonable_label() */
3563 strcpy(config->devnames[0][ac->clabel->column],
3564 ac->devname);
3565 ac = ac->next;
3566 }
3567
3568 for(i=0;i<RF_MAXDBGV;i++) {
3569 config->debugVars[i][0] = 0;
3570 }
3571 }
3572
3573 static int
3574 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3575 {
3576 RF_ComponentLabel_t *clabel;
3577 int column;
3578 int sparecol;
3579
3580 raidPtr->autoconfigure = new_value;
3581
3582 for(column=0; column<raidPtr->numCol; column++) {
3583 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3584 clabel = raidget_component_label(raidPtr, column);
3585 clabel->autoconfigure = new_value;
3586 raidflush_component_label(raidPtr, column);
3587 }
3588 }
3589 for(column = 0; column < raidPtr->numSpare ; column++) {
3590 sparecol = raidPtr->numCol + column;
3591
3592 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3593 clabel = raidget_component_label(raidPtr, sparecol);
3594 clabel->autoconfigure = new_value;
3595 raidflush_component_label(raidPtr, sparecol);
3596 }
3597 }
3598 return(new_value);
3599 }
3600
3601 static int
3602 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3603 {
3604 RF_ComponentLabel_t *clabel;
3605 int column;
3606 int sparecol;
3607
3608 raidPtr->root_partition = new_value;
3609 for(column=0; column<raidPtr->numCol; column++) {
3610 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3611 clabel = raidget_component_label(raidPtr, column);
3612 clabel->root_partition = new_value;
3613 raidflush_component_label(raidPtr, column);
3614 }
3615 }
3616 for (column = 0; column < raidPtr->numSpare ; column++) {
3617 sparecol = raidPtr->numCol + column;
3618
3619 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3620 clabel = raidget_component_label(raidPtr, sparecol);
3621 clabel->root_partition = new_value;
3622 raidflush_component_label(raidPtr, sparecol);
3623 }
3624 }
3625 return(new_value);
3626 }
3627
3628 static void
3629 rf_release_all_vps(RF_ConfigSet_t *cset)
3630 {
3631 RF_AutoConfig_t *ac;
3632
3633 ac = cset->ac;
3634 while(ac!=NULL) {
3635 /* Close the vp, and give it back */
3636 if (ac->vp) {
3637 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3638 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3639 vput(ac->vp);
3640 ac->vp = NULL;
3641 }
3642 ac = ac->next;
3643 }
3644 }
3645
3646
3647 static void
3648 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3649 {
3650 RF_AutoConfig_t *ac;
3651 RF_AutoConfig_t *next_ac;
3652
3653 ac = cset->ac;
3654 while(ac!=NULL) {
3655 next_ac = ac->next;
3656 /* nuke the label */
3657 free(ac->clabel, M_RAIDFRAME);
3658 /* cleanup the config structure */
3659 free(ac, M_RAIDFRAME);
3660 /* "next.." */
3661 ac = next_ac;
3662 }
3663 /* and, finally, nuke the config set */
3664 free(cset, M_RAIDFRAME);
3665 }
3666
3667
3668 void
3669 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3670 {
3671 /* avoid over-writing byteswapped version. */
3672 if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
3673 clabel->version = RF_COMPONENT_LABEL_VERSION;
3674 clabel->serial_number = raidPtr->serial_number;
3675 clabel->mod_counter = raidPtr->mod_counter;
3676
3677 clabel->num_rows = 1;
3678 clabel->num_columns = raidPtr->numCol;
3679 clabel->clean = RF_RAID_DIRTY; /* not clean */
3680 clabel->status = rf_ds_optimal; /* "It's good!" */
3681
3682 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3683 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3684 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3685
3686 clabel->blockSize = raidPtr->bytesPerSector;
3687 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3688
3689 /* XXX not portable */
3690 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3691 clabel->maxOutstanding = raidPtr->maxOutstanding;
3692 clabel->autoconfigure = raidPtr->autoconfigure;
3693 clabel->root_partition = raidPtr->root_partition;
3694 clabel->last_unit = raidPtr->raidid;
3695 clabel->config_order = raidPtr->config_order;
3696
3697 #ifndef RF_NO_PARITY_MAP
3698 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3699 #endif
3700 }
3701
3702 static struct raid_softc *
3703 rf_auto_config_set(RF_ConfigSet_t *cset)
3704 {
3705 RF_Raid_t *raidPtr;
3706 RF_Config_t *config;
3707 int raidID;
3708 struct raid_softc *sc;
3709
3710 #ifdef DEBUG
3711 printf("RAID autoconfigure\n");
3712 #endif
3713
3714 /* 1. Create a config structure */
3715 config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);
3716
3717 /*
3718 2. Figure out what RAID ID this one is supposed to live at
3719 See if we can get the same RAID dev that it was configured
3720 on last time..
3721 */
3722
3723 raidID = cset->ac->clabel->last_unit;
3724 for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
3725 sc = raidget(++raidID, false))
3726 continue;
3727 #ifdef DEBUG
3728 printf("Configuring raid%d:\n",raidID);
3729 #endif
3730
3731 if (sc == NULL)
3732 sc = raidget(raidID, true);
3733 raidPtr = &sc->sc_r;
3734
3735 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3736 raidPtr->softc = sc;
3737 raidPtr->raidid = raidID;
3738 raidPtr->openings = RAIDOUTSTANDING;
3739
3740 /* 3. Build the configuration structure */
3741 rf_create_configuration(cset->ac, config, raidPtr);
3742
3743 /* 4. Do the configuration */
3744 if (rf_Configure(raidPtr, config, cset->ac) == 0) {
3745 raidinit(sc);
3746
3747 rf_markalldirty(raidPtr);
3748 raidPtr->autoconfigure = 1; /* XXX do this here? */
3749 switch (cset->ac->clabel->root_partition) {
3750 case 1: /* Force Root */
3751 case 2: /* Soft Root: root when boot partition part of raid */
3752 /*
3753 * everything configured just fine. Make a note
3754 * that this set is eligible to be root,
3755 * or forced to be root
3756 */
3757 cset->rootable = cset->ac->clabel->root_partition;
3758 /* XXX do this here? */
3759 raidPtr->root_partition = cset->rootable;
3760 break;
3761 default:
3762 break;
3763 }
3764 } else {
3765 raidput(sc);
3766 sc = NULL;
3767 }
3768
3769 /* 5. Cleanup */
3770 free(config, M_RAIDFRAME);
3771 return sc;
3772 }
3773
3774 void
3775 rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
3776 size_t xmin, size_t xmax)
3777 {
3778
3779 /* Format: raid%d_foo */
3780 snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);
3781
3782 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3783 pool_sethiwat(p, xmax);
3784 pool_prime(p, xmin);
3785 }
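
/*
 * Usage sketch for rf_pool_init(); the names below are assumptions for
 * illustration, and the real call sites live elsewhere in RAIDframe:
 */
#if 0
	char wchan[RF_MAX_POOLNAMELEN];
	struct pool my_pool;

	/* primes xmin = 10 entries and caps the high-water mark at 100 */
	rf_pool_init(raidPtr, wchan, &my_pool, sizeof(struct my_entry),
	    "myent", 10, 100);
#endif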
3786
3787
3788 /*
3789  * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buffer queue
3790 * to see if there is IO pending and if that IO could possibly be done
3791 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3792 * otherwise.
3793 *
3794 */
3795 int
3796 rf_buf_queue_check(RF_Raid_t *raidPtr)
3797 {
3798 struct raid_softc *rs;
3799 struct dk_softc *dksc;
3800
3801 rs = raidPtr->softc;
3802 dksc = &rs->sc_dksc;
3803
3804 if ((rs->sc_flags & RAIDF_INITED) == 0)
3805 return 1;
3806
3807 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3808 /* there is work to do */
3809 return 0;
3810 }
3811 /* default is nothing to do */
3812 return 1;
3813 }
3814
3815 int
3816 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3817 {
3818 uint64_t numsecs;
3819 unsigned secsize;
3820 int error;
3821
3822 error = getdisksize(vp, &numsecs, &secsize);
3823 if (error == 0) {
3824 diskPtr->blockSize = secsize;
3825 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3826 diskPtr->partitionSize = numsecs;
3827 return 0;
3828 }
3829 return error;
3830 }
3831
3832 static int
3833 raid_match(device_t self, cfdata_t cfdata, void *aux)
3834 {
3835 return 1;
3836 }
3837
3838 static void
3839 raid_attach(device_t parent, device_t self, void *aux)
3840 {
3841 }
3842
3843
3844 static int
3845 raid_detach(device_t self, int flags)
3846 {
3847 int error;
3848 struct raid_softc *rs = raidsoftc(self);
3849
3850 if (rs == NULL)
3851 return ENXIO;
3852
3853 if ((error = raidlock(rs)) != 0)
3854 return error;
3855
3856 error = raid_detach_unlocked(rs);
3857
3858 raidunlock(rs);
3859
3860 /* XXX raid can be referenced here */
3861
3862 if (error)
3863 return error;
3864
3865 /* Free the softc */
3866 raidput(rs);
3867
3868 return 0;
3869 }
3870
3871 static void
3872 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3873 {
3874 struct dk_softc *dksc = &rs->sc_dksc;
3875 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3876
3877 memset(dg, 0, sizeof(*dg));
3878
3879 dg->dg_secperunit = raidPtr->totalSectors;
3880 dg->dg_secsize = raidPtr->bytesPerSector;
3881 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3882 dg->dg_ntracks = 4 * raidPtr->numCol;
3883
3884 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3885 }
3886
3887 /*
3888 * Get cache info for all the components (including spares).
3889 * Returns intersection of all the cache flags of all disks, or first
3890 * error if any encountered.
3891 * XXXfua feature flags can change as spares are added - lock down somehow
3892 */
3893 static int
3894 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3895 {
3896 int c;
3897 int error;
3898 int dkwhole = 0, dkpart;
3899
3900 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3901 /*
3902 * Check any non-dead disk, even when currently being
3903 * reconstructed.
3904 */
3905 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
3906 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3907 DIOCGCACHE, &dkpart, FREAD, NOCRED);
3908 if (error) {
3909 if (error != ENODEV) {
3910 printf("raid%d: get cache for component %s failed\n",
3911 raidPtr->raidid,
3912 raidPtr->Disks[c].devname);
3913 }
3914
3915 return error;
3916 }
3917
3918 if (c == 0)
3919 dkwhole = dkpart;
3920 else
3921 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3922 }
3923 }
3924
3925 *data = dkwhole;
3926
3927 return 0;
3928 }
3929
3930 /*
3931 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3932 * We end up returning whatever error was returned by the first cache flush
3933 * that fails.
3934 */
3935
3936 static int
3937 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
3938 {
3939 int e = 0;
3940 for (int i = 0; i < 5; i++) {
3941 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3942 &force, FWRITE, NOCRED);
3943 if (!e || e == ENODEV)
3944 return e;
3945 printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
3946 raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
3947 }
3948 return e;
3949 }
3950
3951 int
3952 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
3953 {
3954 int c, error;
3955
3956 error = 0;
3957 for (c = 0; c < raidPtr->numCol; c++) {
3958 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3959 int e = rf_sync_component_cache(raidPtr, c, force);
3960 if (e && !error)
3961 error = e;
3962 }
3963 }
3964
3965 for (c = 0; c < raidPtr->numSpare ; c++) {
3966 int sparecol = raidPtr->numCol + c;
3967
3968 /* Need to ensure that the reconstruct actually completed! */
3969 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3970 int e = rf_sync_component_cache(raidPtr, sparecol,
3971 force);
3972 if (e && !error)
3973 error = e;
3974 }
3975 }
3976 return error;
3977 }
3978
/* Fill in info with the current status */
void
rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
{

	memset(info, 0, sizeof(*info));

	if (raidPtr->status != rf_rs_reconstructing) {
		info->total = 100;
		info->completed = 100;
	} else {
		info->total = raidPtr->reconControl->numRUsTotal;
		info->completed = raidPtr->reconControl->numRUsComplete;
	}
	info->remaining = info->total - info->completed;
}

/* Fill in info with the current status */
void
rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
{

	memset(info, 0, sizeof(*info));

	if (raidPtr->parity_rewrite_in_progress == 1) {
		info->total = raidPtr->Layout.numStripe;
		info->completed = raidPtr->parity_rewrite_stripes_done;
	} else {
		info->completed = 100;
		info->total = 100;
	}
	info->remaining = info->total - info->completed;
}

/* Fill in info with the current status */
void
rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
{

	memset(info, 0, sizeof(*info));

	if (raidPtr->copyback_in_progress == 1) {
		info->total = raidPtr->Layout.numStripe;
		info->completed = raidPtr->copyback_stripes_done;
		info->remaining = info->total - info->completed;
	} else {
		info->remaining = 0;
		info->completed = 100;
		info->total = 100;
	}
}

/*
 * Fill in config with the current info.  Returns ENOMEM if the set
 * has more columns or spares than the fixed-size arrays in
 * RF_DeviceConfig_t can hold.
 */
int
rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
{
	int d, i, j;

	if (!raidPtr->valid)
		return ENODEV;
	config->cols = raidPtr->numCol;
	config->ndevs = raidPtr->numCol;
	if (config->ndevs >= RF_MAX_DISKS)
		return ENOMEM;
	config->nspares = raidPtr->numSpare;
	if (config->nspares >= RF_MAX_DISKS)
		return ENOMEM;
	config->maxqdepth = raidPtr->maxQueueDepth;
	d = 0;
	for (j = 0; j < config->cols; j++) {
		config->devs[d] = raidPtr->Disks[j];
		d++;
	}
	for (i = 0; i < config->nspares; i++) {
		config->spares[i] = raidPtr->Disks[raidPtr->numCol + i];
		if (config->spares[i].status == rf_ds_rebuilding_spare) {
			/* raidctl(8) expects to see this as a used spare */
			config->spares[i].status = rf_ds_used_spare;
		}
	}
	return 0;
}

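/*
 * Copy the component label for the given column (data columns first,
 * then spares) out to the caller-supplied buffer.
 */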
int
rf_get_component_label(RF_Raid_t *raidPtr, void *data)
{
	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
	RF_ComponentLabel_t *raid_clabel;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
		return EINVAL;
	raid_clabel = raidget_component_label(raidPtr, column);
	memcpy(clabel, raid_clabel, sizeof *clabel);
	/*
	 * Fix-up for userland: a label written on a machine of the
	 * opposite endianness carries a byte-swapped version field;
	 * present it in native byte order.
	 */
	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;

	return 0;
}

/*
 * Module interface
 */

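/*
 * Declare the module; the third argument lists the modules this one
 * requires (the dk_subr disk helpers and the bufq_fcfs buffer-queue
 * strategy).
 */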
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);

static int
raid_modcmd(modcmd_t cmd, void *data)
{
	int error;

	error = 0;
	switch (cmd) {
	case MODULE_CMD_INIT:
		error = raid_modcmd_init();
		break;
	case MODULE_CMD_FINI:
		error = raid_modcmd_fini();
		break;
	default:
		error = ENOTTY;
		break;
	}
	return error;
}

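/*
 * Module initialization: attach the block/character devsw, register
 * the autoconf driver and attachment, boot the RAIDframe core, and
 * register a finalizer to auto-configure RAID sets once all real
 * hardware devices have been found.
 */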
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	bmajor = cmajor = -1;
	/* EEXIST is tolerated: the devsw may already be attached. */
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Not fatal; the sets can still be configured by hand. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}

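/*
 * Module teardown: refuse to unload while any raid device exists,
 * then undo everything raid_modcmd_init() set up, in reverse order.
 */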
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist. */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n", __func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n", __func__);
		/* Re-attach the cfattach so we are back in a sane state. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	devsw_detach(&raid_bdevsw, &raid_cdevsw);
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
