/*	$NetBSD: rf_netbsdkintf.c,v 1.391.2.1 2021/05/13 00:47:32 thorpej Exp $	*/
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.391.2.1 2021/05/13 00:47:32 thorpej Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_raid_autoconfig.h"
108 #include "opt_compat_netbsd32.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/compat_stub.h>
131
132 #include <prop/proplib.h>
133
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137
138 #include "rf_raid.h"
139 #include "rf_copyback.h"
140 #include "rf_dag.h"
141 #include "rf_dagflags.h"
142 #include "rf_desc.h"
143 #include "rf_diskqueue.h"
144 #include "rf_etimer.h"
145 #include "rf_general.h"
146 #include "rf_kintf.h"
147 #include "rf_options.h"
148 #include "rf_driver.h"
149 #include "rf_parityscan.h"
150 #include "rf_threadstuff.h"
151
152 #include "ioconf.h"
153
154 #ifdef DEBUG
155 int rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else /* DEBUG */
158 #define db1_printf(a) { }
159 #endif /* DEBUG */
160
161 #ifdef DEBUG_ROOT
162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
163 #else
164 #define DPRINTF(a, ...)
165 #endif
166
167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
168 static rf_declare_mutex2(rf_sparet_wait_mutex);
169 static rf_declare_cond2(rf_sparet_wait_cv);
170 static rf_declare_cond2(rf_sparet_resp_cv);
171
172 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
173 * spare table */
174 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
175 * installation process */
176 #endif
177
178 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
179
180 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
181
182 /* prototypes */
183 static void KernelWakeupFunc(struct buf *);
184 static void InitBP(struct buf *, struct vnode *, unsigned,
185 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
186 void *, int);
187 static void raidinit(struct raid_softc *);
188 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
189 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
190
191 static int raid_match(device_t, cfdata_t, void *);
192 static void raid_attach(device_t, device_t, void *);
193 static int raid_detach(device_t, int);
194
195 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
196 daddr_t, daddr_t);
197 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
198 daddr_t, daddr_t, int);
199
200 static int raidwrite_component_label(unsigned,
201 dev_t, struct vnode *, RF_ComponentLabel_t *);
202 static int raidread_component_label(unsigned,
203 dev_t, struct vnode *, RF_ComponentLabel_t *);
204
205 static int raid_diskstart(device_t, struct buf *bp);
206 static int raid_dumpblocks(device_t, void *, daddr_t, int);
207 static int raid_lastclose(device_t);
208
209 static dev_type_open(raidopen);
210 static dev_type_close(raidclose);
211 static dev_type_read(raidread);
212 static dev_type_write(raidwrite);
213 static dev_type_ioctl(raidioctl);
214 static dev_type_strategy(raidstrategy);
215 static dev_type_dump(raiddump);
216 static dev_type_size(raidsize);
217
/* Block device switch: exposes the RAID set as a disk-type block device. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
228
/* Character (raw) device switch; reads/writes go through physio(). */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
243
/* Hooks handed to the dk(9) disk framework for this driver. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,		/* start one queued buf */
	.d_dumpblocks = raid_dumpblocks,	/* crash-dump path */
	.d_lastclose = raid_lastclose,		/* final close of last partition */
	.d_minphys = minphys
};
253
254 #define raidunit(x) DISKUNIT(x)
255 #define raidsoftc(dev) (((struct raid_softc *)device_private(dev))->sc_r.softc)
256
257 extern struct cfdriver raid_cd;
258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
259 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
260 DVF_DETACH_SHUTDOWN);
261
/*
 * Internal representation of a rf_recon_req: a kernel-side copy of a
 * userland reconstruction request, safe to hand off to the recon
 * thread after the ioctl returns.
 */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* column (component) to operate on */
	RF_ReconReqFlags_t flags;	/* request flags copied from userland */
	void *raidPtr;			/* owning RF_Raid_t */
};
268
269 /*
270 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
271 * Be aware that large numbers can allow the driver to consume a lot of
272 * kernel memory, especially on writes, and in degraded mode reads.
273 *
274 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
275 * a single 64K write will typically require 64K for the old data,
276 * 64K for the old parity, and 64K for the new parity, for a total
277 * of 192K (if the parity buffer is not re-used immediately).
278 * Even it if is used immediately, that's still 128K, which when multiplied
279 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
280 *
281 * Now in degraded mode, for example, a 64K read on the above setup may
282 * require data reconstruction, which will require *all* of the 4 remaining
283 * disks to participate -- 4 * 32K/disk == 128K again.
284 */
285
286 #ifndef RAIDOUTSTANDING
287 #define RAIDOUTSTANDING 6
288 #endif
289
290 #define RAIDLABELDEV(dev) \
291 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
292
293 /* declared here, and made public, for the benefit of KVM stuff.. */
294
295 static int raidlock(struct raid_softc *);
296 static void raidunlock(struct raid_softc *);
297
298 static int raid_detach_unlocked(struct raid_softc *);
299
300 static void rf_markalldirty(RF_Raid_t *);
301 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
302
303 void rf_ReconThread(struct rf_recon_req_internal *);
304 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
305 void rf_CopybackThread(RF_Raid_t *raidPtr);
306 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
307 int rf_autoconfig(device_t);
308 void rf_buildroothack(RF_ConfigSet_t *);
309
310 RF_AutoConfig_t *rf_find_raid_components(void);
311 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
312 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
313 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
314 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
315 int rf_set_autoconfig(RF_Raid_t *, int);
316 int rf_set_rootpartition(RF_Raid_t *, int);
317 void rf_release_all_vps(RF_ConfigSet_t *);
318 void rf_cleanup_config_set(RF_ConfigSet_t *);
319 int rf_have_enough_components(RF_ConfigSet_t *);
320 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
321 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
322
323 /*
324 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
325 * Note that this is overridden by having RAID_AUTOCONFIG as an option
326 * in the kernel config file.
327 */
328 #ifdef RAID_AUTOCONFIG
329 int raidautoconfig = 1;
330 #else
331 int raidautoconfig = 0;
332 #endif
333 static bool raidautoconfigdone = false;
334
335 struct RF_Pools_s rf_pools;
336
337 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
338 static kmutex_t raid_lock;
339
340 static struct raid_softc *
341 raidcreate(int unit) {
342 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
343 sc->sc_unit = unit;
344 cv_init(&sc->sc_cv, "raidunit");
345 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
346 return sc;
347 }
348
349 static void
350 raiddestroy(struct raid_softc *sc) {
351 cv_destroy(&sc->sc_cv);
352 mutex_destroy(&sc->sc_mutex);
353 kmem_free(sc, sizeof(*sc));
354 }
355
356 static struct raid_softc *
357 raidget(int unit, bool create) {
358 struct raid_softc *sc;
359 if (unit < 0) {
360 #ifdef DIAGNOSTIC
361 panic("%s: unit %d!", __func__, unit);
362 #endif
363 return NULL;
364 }
365 mutex_enter(&raid_lock);
366 LIST_FOREACH(sc, &raids, sc_link) {
367 if (sc->sc_unit == unit) {
368 mutex_exit(&raid_lock);
369 return sc;
370 }
371 }
372 mutex_exit(&raid_lock);
373 if (!create)
374 return NULL;
375 sc = raidcreate(unit);
376 mutex_enter(&raid_lock);
377 LIST_INSERT_HEAD(&raids, sc, sc_link);
378 mutex_exit(&raid_lock);
379 return sc;
380 }
381
/* Unlink a softc from the global list (under raid_lock) and free it. */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
389
/*
 * Legacy pseudo-device attach entry point; intentionally empty.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
399
/*
 * One-shot RAID autoconfiguration: locate all RAID components on the
 * system, group them into sets, and configure the valid ones (done in
 * rf_buildroothack()).  Returns 0 if autoconfiguration is disabled or
 * has already run, 1 after performing a scan.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return 0;

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
437
438 int
439 rf_inited(const struct raid_softc *rs) {
440 return (rs->sc_flags & RAIDF_INITED) != 0;
441 }
442
443 RF_Raid_t *
444 rf_get_raid(struct raid_softc *rs) {
445 return &rs->sc_r;
446 }
447
/* Accessor: unit number of a raid_softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
452
453 static int
454 rf_containsboot(RF_Raid_t *r, device_t bdv) {
455 const char *bootname;
456 size_t len;
457
458 /* if bdv is NULL, the set can't contain it. exit early. */
459 if (bdv == NULL)
460 return 0;
461
462 bootname = device_xname(bdv);
463 len = strlen(bootname);
464
465 for (int col = 0; col < r->numCol; col++) {
466 const char *devname = r->Disks[col].devname;
467 devname += sizeof("/dev/") - 1;
468 if (strncmp(devname, "dk", 2) == 0) {
469 const char *parent =
470 dkwedge_get_parent_name(r->Disks[col].dev);
471 if (parent != NULL)
472 devname = parent;
473 }
474 if (strncmp(devname, bootname, len) == 0) {
475 struct raid_softc *sc = r->softc;
476 aprint_debug("raid%d includes boot device %s\n",
477 sc->sc_unit, devname);
478 return 1;
479 }
480 }
481 return 0;
482 }
483
/*
 * Configure all autoconfigurable RAID sets and, when appropriate,
 * point the kernel's root device at one of them.  A set overrides
 * booted_device when it is marked rootable and either (a) the boot
 * device is unknown, (b) the set forces root (root_partition == 1),
 * or (c) the set contains the boot device.  With several rootable
 * sets, the one containing the boot device wins; if that is still
 * ambiguous, the user is asked (RB_ASKNAME).
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/* Walk every detected set; configure those that are complete and
	   marked for autoconfiguration, releasing resources otherwise. */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
		    "contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			DPRINTF("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Re-count, keeping only rootable sets that actually
		   contain the known boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
628
629 static int
630 raidsize(dev_t dev)
631 {
632 struct raid_softc *rs;
633 struct dk_softc *dksc;
634 unsigned int unit;
635
636 unit = raidunit(dev);
637 if ((rs = raidget(unit, false)) == NULL)
638 return -1;
639 dksc = &rs->sc_dksc;
640
641 if ((rs->sc_flags & RAIDF_INITED) == 0)
642 return -1;
643
644 return dk_size(dksc, dev);
645 }
646
647 static int
648 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
649 {
650 unsigned int unit;
651 struct raid_softc *rs;
652 struct dk_softc *dksc;
653
654 unit = raidunit(dev);
655 if ((rs = raidget(unit, false)) == NULL)
656 return ENXIO;
657 dksc = &rs->sc_dksc;
658
659 if ((rs->sc_flags & RAIDF_INITED) == 0)
660 return ENODEV;
661
662 /*
663 Note that blkno is relative to this particular partition.
664 By adding adding RF_PROTECTED_SECTORS, we get a value that
665 is relative to the partition used for the underlying component.
666 */
667 blkno += RF_PROTECTED_SECTORS;
668
669 return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
670 }
671
/*
 * Write `nblk' blocks of dump data at `blkno' directly to one live
 * component of a RAID 1 set.  Only RAID 1 (1 data + 1 parity column)
 * is supported.  Component preference: first component, a used spare
 * of the first, second component, then a used spare of the second.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* Find which column this spare replaces. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Dump straight through the component's own d_dump routine. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
777
778 /* ARGSUSED */
779 static int
780 raidopen(dev_t dev, int flags, int fmt,
781 struct lwp *l)
782 {
783 int unit = raidunit(dev);
784 struct raid_softc *rs;
785 struct dk_softc *dksc;
786 int error = 0;
787 int part, pmask;
788
789 if ((rs = raidget(unit, true)) == NULL)
790 return ENXIO;
791 if ((error = raidlock(rs)) != 0)
792 return error;
793
794 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
795 error = EBUSY;
796 goto bad;
797 }
798
799 dksc = &rs->sc_dksc;
800
801 part = DISKPART(dev);
802 pmask = (1 << part);
803
804 if (!DK_BUSY(dksc, pmask) &&
805 ((rs->sc_flags & RAIDF_INITED) != 0)) {
806 /* First one... mark things as dirty... Note that we *MUST*
807 have done a configure before this. I DO NOT WANT TO BE
808 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
809 THAT THEY BELONG TOGETHER!!!!! */
810 /* XXX should check to see if we're only open for reading
811 here... If so, we needn't do this, but then need some
812 other way of keeping track of what's happened.. */
813
814 rf_markalldirty(&rs->sc_r);
815 }
816
817 if ((rs->sc_flags & RAIDF_INITED) != 0)
818 error = dk_open(dksc, dev, flags, fmt, l);
819
820 bad:
821 raidunlock(rs);
822
823 return error;
824
825
826 }
827
828 static int
829 raid_lastclose(device_t self)
830 {
831 struct raid_softc *rs = raidsoftc(self);
832
833 /* Last one... device is not unconfigured yet.
834 Device shutdown has taken care of setting the
835 clean bits if RAIDF_INITED is not set
836 mark things as clean... */
837
838 rf_update_component_labels(&rs->sc_r,
839 RF_FINAL_COMPONENT_UPDATE);
840
841 /* pass to unlocked code */
842 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
843 rs->sc_flags |= RAIDF_DETACH;
844
845 return 0;
846 }
847
848 /* ARGSUSED */
849 static int
850 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
851 {
852 int unit = raidunit(dev);
853 struct raid_softc *rs;
854 struct dk_softc *dksc;
855 cfdata_t cf;
856 int error = 0, do_detach = 0, do_put = 0;
857
858 if ((rs = raidget(unit, false)) == NULL)
859 return ENXIO;
860 dksc = &rs->sc_dksc;
861
862 if ((error = raidlock(rs)) != 0)
863 return error;
864
865 if ((rs->sc_flags & RAIDF_INITED) != 0) {
866 error = dk_close(dksc, dev, flags, fmt, l);
867 if ((rs->sc_flags & RAIDF_DETACH) != 0)
868 do_detach = 1;
869 } else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
870 do_put = 1;
871
872 raidunlock(rs);
873
874 if (do_detach) {
875 /* free the pseudo device attach bits */
876 cf = device_cfdata(dksc->sc_dev);
877 error = config_detach(dksc->sc_dev, 0);
878 if (error == 0)
879 free(cf, M_RAIDFRAME);
880 } else if (do_put) {
881 raidput(rs);
882 }
883
884 return error;
885
886 }
887
/* Signal the iodone thread so queued work gets picked up. */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
895
896 static void
897 raidstrategy(struct buf *bp)
898 {
899 unsigned int unit;
900 struct raid_softc *rs;
901 struct dk_softc *dksc;
902 RF_Raid_t *raidPtr;
903
904 unit = raidunit(bp->b_dev);
905 if ((rs = raidget(unit, false)) == NULL) {
906 bp->b_error = ENXIO;
907 goto fail;
908 }
909 if ((rs->sc_flags & RAIDF_INITED) == 0) {
910 bp->b_error = ENXIO;
911 goto fail;
912 }
913 dksc = &rs->sc_dksc;
914 raidPtr = &rs->sc_r;
915
916 /* Queue IO only */
917 if (dk_strategy_defer(dksc, bp))
918 goto done;
919
920 /* schedule the IO to happen at the next convenient time */
921 raid_wakeup(raidPtr);
922
923 done:
924 return;
925
926 fail:
927 bp->b_resid = bp->b_bcount;
928 biodone(bp);
929 }
930
931 static int
932 raid_diskstart(device_t dev, struct buf *bp)
933 {
934 struct raid_softc *rs = raidsoftc(dev);
935 RF_Raid_t *raidPtr;
936
937 raidPtr = &rs->sc_r;
938 if (!raidPtr->valid) {
939 db1_printf(("raid is not valid..\n"));
940 return ENODEV;
941 }
942
943 /* XXX */
944 bp->b_resid = 0;
945
946 return raiddoaccess(raidPtr, bp);
947 }
948
949 void
950 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
951 {
952 struct raid_softc *rs;
953 struct dk_softc *dksc;
954
955 rs = raidPtr->softc;
956 dksc = &rs->sc_dksc;
957
958 dk_done(dksc, bp);
959
960 rf_lock_mutex2(raidPtr->mutex);
961 raidPtr->openings++;
962 rf_unlock_mutex2(raidPtr->mutex);
963
964 /* schedule more IO */
965 raid_wakeup(raidPtr);
966 }
967
968 /* ARGSUSED */
969 static int
970 raidread(dev_t dev, struct uio *uio, int flags)
971 {
972 int unit = raidunit(dev);
973 struct raid_softc *rs;
974
975 if ((rs = raidget(unit, false)) == NULL)
976 return ENXIO;
977
978 if ((rs->sc_flags & RAIDF_INITED) == 0)
979 return ENXIO;
980
981 return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
982
983 }
984
985 /* ARGSUSED */
986 static int
987 raidwrite(dev_t dev, struct uio *uio, int flags)
988 {
989 int unit = raidunit(dev);
990 struct raid_softc *rs;
991
992 if ((rs = raidget(unit, false)) == NULL)
993 return ENXIO;
994
995 if ((rs->sc_flags & RAIDF_INITED) == 0)
996 return ENXIO;
997
998 return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
999
1000 }
1001
/*
 * Tear down a configured RAID set.  Caller holds the softc lock.
 * Fails with EBUSY while any partition is open or a background
 * operation (reconstruction, parity rewrite, copyback) is running.
 * A not-yet-configured unit succeeds trivially.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* NOTE(review): SHUTDOWN is cleared before rf_Shutdown(),
	 * presumably so a failed shutdown leaves the unit usable —
	 * confirm against the ioctl paths that set the flag. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
1039
/*
 * Return true when `cmd' requires a configured array but this unit
 * has not been configured (RAIDF_INITED clear).  Any command not
 * listed here may be issued against an unconfigured unit.
 */
static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}
1077
1078 int
1079 rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
1080 {
1081 struct rf_recon_req_internal *rrint;
1082
1083 if (raidPtr->Layout.map->faultsTolerated == 0) {
1084 /* Can't do this on a RAID 0!! */
1085 return EINVAL;
1086 }
1087
1088 if (rr->col < 0 || rr->col >= raidPtr->numCol) {
1089 /* bad column */
1090 return EINVAL;
1091 }
1092
1093 rf_lock_mutex2(raidPtr->mutex);
1094 if (raidPtr->status == rf_rs_reconstructing) {
1095 /* you can't fail a disk while we're reconstructing! */
1096 /* XXX wrong for RAID6 */
1097 goto out;
1098 }
1099 if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
1100 (raidPtr->numFailures > 0)) {
1101 /* some other component has failed. Let's not make
1102 things worse. XXX wrong for RAID6 */
1103 goto out;
1104 }
1105 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1106 /* Can't fail a spared disk! */
1107 goto out;
1108 }
1109 rf_unlock_mutex2(raidPtr->mutex);
1110
1111 /* make a copy of the recon request so that we don't rely on
1112 * the user's buffer */
1113 rrint = RF_Malloc(sizeof(*rrint));
1114 if (rrint == NULL)
1115 return(ENOMEM);
1116 rrint->col = rr->col;
1117 rrint->flags = rr->flags;
1118 rrint->raidPtr = raidPtr;
1119
1120 return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
1121 rrint, "raid_recon");
1122 out:
1123 rf_unlock_mutex2(raidPtr->mutex);
1124 return EINVAL;
1125 }
1126
1127 static int
1128 rf_copyinspecificbuf(RF_Config_t *k_cfg)
1129 {
1130 /* allocate a buffer for the layout-specific data, and copy it in */
1131 if (k_cfg->layoutSpecificSize == 0)
1132 return 0;
1133
1134 if (k_cfg->layoutSpecificSize > 10000) {
1135 /* sanity check */
1136 return EINVAL;
1137 }
1138
1139 u_char *specific_buf;
1140 specific_buf = RF_Malloc(k_cfg->layoutSpecificSize);
1141 if (specific_buf == NULL)
1142 return ENOMEM;
1143
1144 int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1145 k_cfg->layoutSpecificSize);
1146 if (retcode) {
1147 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1148 db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
1149 return retcode;
1150 }
1151
1152 k_cfg->layoutSpecific = specific_buf;
1153 return 0;
1154 }
1155
1156 static int
1157 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
1158 {
1159 RF_Config_t *u_cfg = *((RF_Config_t **) data);
1160
1161 if (rs->sc_r.valid) {
1162 /* There is a valid RAID set running on this unit! */
1163 printf("raid%d: Device already configured!\n", rs->sc_unit);
1164 return EINVAL;
1165 }
1166
1167 /* copy-in the configuration information */
1168 /* data points to a pointer to the configuration structure */
1169 *k_cfg = RF_Malloc(sizeof(**k_cfg));
1170 if (*k_cfg == NULL) {
1171 return ENOMEM;
1172 }
1173 int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
1174 if (retcode == 0)
1175 return 0;
1176 RF_Free(*k_cfg, sizeof(RF_Config_t));
1177 db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
1178 rs->sc_flags |= RAIDF_SHUTDOWN;
1179 return retcode;
1180 }
1181
/*
 * Configure a RAID set from the (already copied-in) configuration
 * k_cfg.  Consumes k_cfg: it is always freed before returning.  On
 * failure the unit is marked RAIDF_SHUTDOWN so it is detached on close.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	/* A fresh configuration attempt cancels any pending shutdown. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/*
	 * Pull in the layout-specific data; on failure it has already
	 * freed its own allocation, so we only free k_cfg below.
	 */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 * there is no stale data left in the case of a
	 * reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers. No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
1234
1235 #if RF_DISABLED
/*
 * Install a caller-supplied component label.  NOTE: this code is
 * compiled out (see the surrounding RF_DISABLED conditional) and is
 * deliberately inert -- see the XXX comments below.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;	/* only a single row is supported */
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
1272 #endif
1273
1274 static int
1275 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
1276 {
1277 /*
1278 we only want the serial number from
1279 the above. We get all the rest of the information
1280 from the config that was used to create this RAID
1281 set.
1282 */
1283
1284 raidPtr->serial_number = clabel->serial_number;
1285
1286 for (int column = 0; column < raidPtr->numCol; column++) {
1287 RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
1288 if (RF_DEAD_DISK(diskPtr->status))
1289 continue;
1290 RF_ComponentLabel_t *ci_label = raidget_component_label(
1291 raidPtr, column);
1292 /* Zeroing this is important. */
1293 memset(ci_label, 0, sizeof(*ci_label));
1294 raid_init_component_label(raidPtr, ci_label);
1295 ci_label->serial_number = raidPtr->serial_number;
1296 ci_label->row = 0; /* we dont' pretend to support more */
1297 rf_component_label_set_partitionsize(ci_label,
1298 diskPtr->partitionSize);
1299 ci_label->column = column;
1300 raidflush_component_label(raidPtr, column);
1301 /* XXXjld what about the spares? */
1302 }
1303
1304 return 0;
1305 }
1306
1307 static int
1308 rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
1309 {
1310
1311 if (raidPtr->Layout.map->faultsTolerated == 0) {
1312 /* Can't do this on a RAID 0!! */
1313 return EINVAL;
1314 }
1315
1316 if (raidPtr->recon_in_progress == 1) {
1317 /* a reconstruct is already in progress! */
1318 return EINVAL;
1319 }
1320
1321 RF_SingleComponent_t component;
1322 memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1323 component.row = 0; /* we don't support any more */
1324 int column = component.column;
1325
1326 if ((column < 0) || (column >= raidPtr->numCol)) {
1327 return EINVAL;
1328 }
1329
1330 rf_lock_mutex2(raidPtr->mutex);
1331 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1332 (raidPtr->numFailures > 0)) {
1333 /* XXX 0 above shouldn't be constant!!! */
1334 /* some component other than this has failed.
1335 Let's not make things worse than they already
1336 are... */
1337 printf("raid%d: Unable to reconstruct to disk at:\n",
1338 raidPtr->raidid);
1339 printf("raid%d: Col: %d Too many failures.\n",
1340 raidPtr->raidid, column);
1341 rf_unlock_mutex2(raidPtr->mutex);
1342 return EINVAL;
1343 }
1344
1345 if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
1346 printf("raid%d: Unable to reconstruct to disk at:\n",
1347 raidPtr->raidid);
1348 printf("raid%d: Col: %d "
1349 "Reconstruction already occurring!\n",
1350 raidPtr->raidid, column);
1351
1352 rf_unlock_mutex2(raidPtr->mutex);
1353 return EINVAL;
1354 }
1355
1356 if (raidPtr->Disks[column].status == rf_ds_spared) {
1357 rf_unlock_mutex2(raidPtr->mutex);
1358 return EINVAL;
1359 }
1360
1361 rf_unlock_mutex2(raidPtr->mutex);
1362
1363 struct rf_recon_req_internal *rrint;
1364 rrint = RF_Malloc(sizeof(*rrint));
1365 if (rrint == NULL)
1366 return ENOMEM;
1367
1368 rrint->col = column;
1369 rrint->raidPtr = raidPtr;
1370
1371 return RF_CREATE_THREAD(raidPtr->recon_thread,
1372 rf_ReconstructInPlaceThread, rrint, "raid_reconip");
1373 }
1374
1375 static int
1376 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
1377 {
1378 /*
1379 * This makes no sense on a RAID 0, or if we are not reconstructing
1380 * so tell the user it's done.
1381 */
1382 if (raidPtr->Layout.map->faultsTolerated == 0 ||
1383 raidPtr->status != rf_rs_reconstructing) {
1384 *data = 100;
1385 return 0;
1386 }
1387 if (raidPtr->reconControl->numRUsTotal == 0) {
1388 *data = 0;
1389 return 0;
1390 }
1391 *data = (raidPtr->reconControl->numRUsComplete * 100
1392 / raidPtr->reconControl->numRUsTotal);
1393 return 0;
1394 }
1395
1396 static int
1397 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1398 {
1399 int unit = raidunit(dev);
1400 int part, pmask;
1401 struct raid_softc *rs;
1402 struct dk_softc *dksc;
1403 RF_Config_t *k_cfg;
1404 RF_Raid_t *raidPtr;
1405 RF_AccTotals_t *totals;
1406 RF_SingleComponent_t component;
1407 RF_DeviceConfig_t *d_cfg, *ucfgp;
1408 int retcode = 0;
1409 int column;
1410 RF_ComponentLabel_t *clabel;
1411 RF_SingleComponent_t *sparePtr,*componentPtr;
1412 int d;
1413
1414 if ((rs = raidget(unit, false)) == NULL)
1415 return ENXIO;
1416
1417 dksc = &rs->sc_dksc;
1418 raidPtr = &rs->sc_r;
1419
1420 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1421 (int) DISKPART(dev), (int) unit, cmd));
1422
1423 /* Must be initialized for these... */
1424 if (rf_must_be_initialized(rs, cmd))
1425 return ENXIO;
1426
1427 switch (cmd) {
1428 /* configure the system */
1429 case RAIDFRAME_CONFIGURE:
1430 if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
1431 return retcode;
1432 return rf_construct(rs, k_cfg);
1433
1434 /* shutdown the system */
1435 case RAIDFRAME_SHUTDOWN:
1436
1437 part = DISKPART(dev);
1438 pmask = (1 << part);
1439
1440 if ((retcode = raidlock(rs)) != 0)
1441 return retcode;
1442
1443 if (DK_BUSY(dksc, pmask) ||
1444 raidPtr->recon_in_progress != 0 ||
1445 raidPtr->parity_rewrite_in_progress != 0 ||
1446 raidPtr->copyback_in_progress != 0)
1447 retcode = EBUSY;
1448 else {
1449 /* detach and free on close */
1450 rs->sc_flags |= RAIDF_SHUTDOWN;
1451 retcode = 0;
1452 }
1453
1454 raidunlock(rs);
1455
1456 return retcode;
1457 case RAIDFRAME_GET_COMPONENT_LABEL:
1458 return rf_get_component_label(raidPtr, data);
1459
1460 #if RF_DISABLED
1461 case RAIDFRAME_SET_COMPONENT_LABEL:
1462 return rf_set_component_label(raidPtr, data);
1463 #endif
1464
1465 case RAIDFRAME_INIT_LABELS:
1466 return rf_init_component_label(raidPtr, data);
1467
1468 case RAIDFRAME_SET_AUTOCONFIG:
1469 d = rf_set_autoconfig(raidPtr, *(int *) data);
1470 printf("raid%d: New autoconfig value is: %d\n",
1471 raidPtr->raidid, d);
1472 *(int *) data = d;
1473 return retcode;
1474
1475 case RAIDFRAME_SET_ROOT:
1476 d = rf_set_rootpartition(raidPtr, *(int *) data);
1477 printf("raid%d: New rootpartition value is: %d\n",
1478 raidPtr->raidid, d);
1479 *(int *) data = d;
1480 return retcode;
1481
1482 /* initialize all parity */
1483 case RAIDFRAME_REWRITEPARITY:
1484
1485 if (raidPtr->Layout.map->faultsTolerated == 0) {
1486 /* Parity for RAID 0 is trivially correct */
1487 raidPtr->parity_good = RF_RAID_CLEAN;
1488 return 0;
1489 }
1490
1491 if (raidPtr->parity_rewrite_in_progress == 1) {
1492 /* Re-write is already in progress! */
1493 return EINVAL;
1494 }
1495
1496 return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1497 rf_RewriteParityThread, raidPtr,"raid_parity");
1498
1499 case RAIDFRAME_ADD_HOT_SPARE:
1500 sparePtr = (RF_SingleComponent_t *) data;
1501 memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
1502 return rf_add_hot_spare(raidPtr, &component);
1503
1504 case RAIDFRAME_REMOVE_HOT_SPARE:
1505 return retcode;
1506
1507 case RAIDFRAME_DELETE_COMPONENT:
1508 componentPtr = (RF_SingleComponent_t *)data;
1509 memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1510 return rf_delete_component(raidPtr, &component);
1511
1512 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1513 componentPtr = (RF_SingleComponent_t *)data;
1514 memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1515 return rf_incorporate_hot_spare(raidPtr, &component);
1516
1517 case RAIDFRAME_REBUILD_IN_PLACE:
1518 return rf_rebuild_in_place(raidPtr, data);
1519
1520 case RAIDFRAME_GET_INFO:
1521 ucfgp = *(RF_DeviceConfig_t **)data;
1522 d_cfg = RF_Malloc(sizeof(*d_cfg));
1523 if (d_cfg == NULL)
1524 return ENOMEM;
1525 retcode = rf_get_info(raidPtr, d_cfg);
1526 if (retcode == 0) {
1527 retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
1528 }
1529 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1530 return retcode;
1531
1532 case RAIDFRAME_CHECK_PARITY:
1533 *(int *) data = raidPtr->parity_good;
1534 return 0;
1535
1536 case RAIDFRAME_PARITYMAP_STATUS:
1537 if (rf_paritymap_ineligible(raidPtr))
1538 return EINVAL;
1539 rf_paritymap_status(raidPtr->parity_map, data);
1540 return 0;
1541
1542 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1543 if (rf_paritymap_ineligible(raidPtr))
1544 return EINVAL;
1545 if (raidPtr->parity_map == NULL)
1546 return ENOENT; /* ??? */
1547 if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
1548 return EINVAL;
1549 return 0;
1550
1551 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1552 if (rf_paritymap_ineligible(raidPtr))
1553 return EINVAL;
1554 *(int *) data = rf_paritymap_get_disable(raidPtr);
1555 return 0;
1556
1557 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1558 if (rf_paritymap_ineligible(raidPtr))
1559 return EINVAL;
1560 rf_paritymap_set_disable(raidPtr, *(int *)data);
1561 /* XXX should errors be passed up? */
1562 return 0;
1563
1564 case RAIDFRAME_RESET_ACCTOTALS:
1565 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1566 return 0;
1567
1568 case RAIDFRAME_GET_ACCTOTALS:
1569 totals = (RF_AccTotals_t *) data;
1570 *totals = raidPtr->acc_totals;
1571 return 0;
1572
1573 case RAIDFRAME_KEEP_ACCTOTALS:
1574 raidPtr->keep_acc_totals = *(int *)data;
1575 return 0;
1576
1577 case RAIDFRAME_GET_SIZE:
1578 *(int *) data = raidPtr->totalSectors;
1579 return 0;
1580
1581 case RAIDFRAME_FAIL_DISK:
1582 return rf_fail_disk(raidPtr, data);
1583
1584 /* invoke a copyback operation after recon on whatever disk
1585 * needs it, if any */
1586 case RAIDFRAME_COPYBACK:
1587
1588 if (raidPtr->Layout.map->faultsTolerated == 0) {
1589 /* This makes no sense on a RAID 0!! */
1590 return EINVAL;
1591 }
1592
1593 if (raidPtr->copyback_in_progress == 1) {
1594 /* Copyback is already in progress! */
1595 return EINVAL;
1596 }
1597
1598 return RF_CREATE_THREAD(raidPtr->copyback_thread,
1599 rf_CopybackThread, raidPtr, "raid_copyback");
1600
1601 /* return the percentage completion of reconstruction */
1602 case RAIDFRAME_CHECK_RECON_STATUS:
1603 return rf_check_recon_status(raidPtr, data);
1604
1605 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1606 rf_check_recon_status_ext(raidPtr, data);
1607 return 0;
1608
1609 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1610 if (raidPtr->Layout.map->faultsTolerated == 0) {
1611 /* This makes no sense on a RAID 0, so tell the
1612 user it's done. */
1613 *(int *) data = 100;
1614 return 0;
1615 }
1616 if (raidPtr->parity_rewrite_in_progress == 1) {
1617 *(int *) data = 100 *
1618 raidPtr->parity_rewrite_stripes_done /
1619 raidPtr->Layout.numStripe;
1620 } else {
1621 *(int *) data = 100;
1622 }
1623 return 0;
1624
1625 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1626 rf_check_parityrewrite_status_ext(raidPtr, data);
1627 return 0;
1628
1629 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1630 if (raidPtr->Layout.map->faultsTolerated == 0) {
1631 /* This makes no sense on a RAID 0 */
1632 *(int *) data = 100;
1633 return 0;
1634 }
1635 if (raidPtr->copyback_in_progress == 1) {
1636 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1637 raidPtr->Layout.numStripe;
1638 } else {
1639 *(int *) data = 100;
1640 }
1641 return 0;
1642
1643 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1644 rf_check_copyback_status_ext(raidPtr, data);
1645 return 0;
1646
1647 case RAIDFRAME_SET_LAST_UNIT:
1648 for (column = 0; column < raidPtr->numCol; column++)
1649 if (raidPtr->Disks[column].status != rf_ds_optimal)
1650 return EBUSY;
1651
1652 for (column = 0; column < raidPtr->numCol; column++) {
1653 clabel = raidget_component_label(raidPtr, column);
1654 clabel->last_unit = *(int *)data;
1655 raidflush_component_label(raidPtr, column);
1656 }
1657 rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1658 return 0;
1659
1660 /* the sparetable daemon calls this to wait for the kernel to
1661 * need a spare table. this ioctl does not return until a
1662 * spare table is needed. XXX -- calling mpsleep here in the
1663 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1664 * -- I should either compute the spare table in the kernel,
1665 * or have a different -- XXX XXX -- interface (a different
1666 * character device) for delivering the table -- XXX */
1667 #if RF_DISABLED
1668 case RAIDFRAME_SPARET_WAIT:
1669 rf_lock_mutex2(rf_sparet_wait_mutex);
1670 while (!rf_sparet_wait_queue)
1671 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1672 RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
1673 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1674 rf_unlock_mutex2(rf_sparet_wait_mutex);
1675
1676 /* structure assignment */
1677 *((RF_SparetWait_t *) data) = *waitreq;
1678
1679 RF_Free(waitreq, sizeof(*waitreq));
1680 return 0;
1681
1682 /* wakes up a process waiting on SPARET_WAIT and puts an error
1683 * code in it that will cause the dameon to exit */
1684 case RAIDFRAME_ABORT_SPARET_WAIT:
1685 waitreq = RF_Malloc(sizeof(*waitreq));
1686 waitreq->fcol = -1;
1687 rf_lock_mutex2(rf_sparet_wait_mutex);
1688 waitreq->next = rf_sparet_wait_queue;
1689 rf_sparet_wait_queue = waitreq;
1690 rf_broadcast_cond2(rf_sparet_wait_cv);
1691 rf_unlock_mutex2(rf_sparet_wait_mutex);
1692 return 0;
1693
1694 /* used by the spare table daemon to deliver a spare table
1695 * into the kernel */
1696 case RAIDFRAME_SEND_SPARET:
1697
1698 /* install the spare table */
1699 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1700
1701 /* respond to the requestor. the return status of the spare
1702 * table installation is passed in the "fcol" field */
1703 waitred = RF_Malloc(sizeof(*waitreq));
1704 waitreq->fcol = retcode;
1705 rf_lock_mutex2(rf_sparet_wait_mutex);
1706 waitreq->next = rf_sparet_resp_queue;
1707 rf_sparet_resp_queue = waitreq;
1708 rf_broadcast_cond2(rf_sparet_resp_cv);
1709 rf_unlock_mutex2(rf_sparet_wait_mutex);
1710
1711 return retcode;
1712 #endif
1713 default:
1714 /*
1715 * Don't bother trying to load compat modules
1716 * if it is not our ioctl. This is more efficient
1717 * and makes rump tests not depend on compat code
1718 */
1719 if (IOCGROUP(cmd) != 'r')
1720 break;
1721 #ifdef _LP64
1722 if ((l->l_proc->p_flag & PK_32) != 0) {
1723 module_autoload("compat_netbsd32_raid",
1724 MODULE_CLASS_EXEC);
1725 MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
1726 (rs, cmd, data), enosys(), retcode);
1727 if (retcode != EPASSTHROUGH)
1728 return retcode;
1729 }
1730 #endif
1731 module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
1732 MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
1733 (rs, cmd, data), enosys(), retcode);
1734 if (retcode != EPASSTHROUGH)
1735 return retcode;
1736
1737 module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
1738 MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
1739 (rs, cmd, data), enosys(), retcode);
1740 if (retcode != EPASSTHROUGH)
1741 return retcode;
1742 break; /* fall through to the os-specific code below */
1743
1744 }
1745
1746 if (!raidPtr->valid)
1747 return EINVAL;
1748
1749 /*
1750 * Add support for "regular" device ioctls here.
1751 */
1752
1753 switch (cmd) {
1754 case DIOCGCACHE:
1755 retcode = rf_get_component_caches(raidPtr, (int *)data);
1756 break;
1757
1758 case DIOCCACHESYNC:
1759 retcode = rf_sync_component_caches(raidPtr, *(int *)data);
1760 break;
1761
1762 default:
1763 retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1764 break;
1765 }
1766
1767 return retcode;
1768
1769 }
1770
1771
1772 /* raidinit -- complete the rest of the initialization for the
1773 RAIDframe device. */
1774
1775
/*
 * Complete the kernel-side initialization of a freshly configured
 * set: attach a pseudo-device, hook up the dk(9)/disk(9) layers,
 * and discover wedges.  On config_attach_pseudo failure the unit is
 * left without RAIDF_INITED set.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Discover wedges on this disk. */
	dkwedge_discover(&dksc->sc_dkdev);
}
1831
1832 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1833 /* wake up the daemon & tell it to get us a spare table
1834 * XXX
1835 * the entries in the queues should be tagged with the raidPtr
1836 * so that in the extremely rare case that two recons happen at once,
1837 * we know for which device were requesting a spare table
1838 * XXX
1839 *
1840 * XXX This code is not currently used. GO
1841 */
/*
 * Hand 'req' to the userland sparetable daemon via the wait queue,
 * then block until a response appears on the response queue.  The
 * response's fcol field carries the status, which is returned.
 * NOTE: compiled only under RF_INCLUDE_PARITY_DECLUSTERING_DS and
 * currently unused (see the comment above this function).
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	/* Post the request and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the daemon's response (re-using 'req' as the cursor). */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
1865 #endif
1866
1867 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1868 * bp & passes it down.
1869 * any calls originating in the kernel must use non-blocking I/O
1870 * do some extra sanity checking to return "appropriate" error values for
1871 * certain conditions (to make some standard utilities work)
1872 *
1873 * Formerly known as: rf_DoAccessKernel
1874 */
1875 void
1876 raidstart(RF_Raid_t *raidPtr)
1877 {
1878 struct raid_softc *rs;
1879 struct dk_softc *dksc;
1880
1881 rs = raidPtr->softc;
1882 dksc = &rs->sc_dksc;
1883 /* quick check to see if anything has died recently */
1884 rf_lock_mutex2(raidPtr->mutex);
1885 if (raidPtr->numNewFailures > 0) {
1886 rf_unlock_mutex2(raidPtr->mutex);
1887 rf_update_component_labels(raidPtr,
1888 RF_NORMAL_COMPONENT_UPDATE);
1889 rf_lock_mutex2(raidPtr->mutex);
1890 raidPtr->numNewFailures--;
1891 }
1892 rf_unlock_mutex2(raidPtr->mutex);
1893
1894 if ((rs->sc_flags & RAIDF_INITED) == 0) {
1895 printf("raid%d: raidstart not ready\n", raidPtr->raidid);
1896 return;
1897 }
1898
1899 dk_start(dksc, NULL);
1900 }
1901
/*
 * Translate a struct buf into a RAIDframe access and submit it via
 * rf_DoAccess().  Returns EAGAIN when no openings are available,
 * ENOSPC for out-of-range or non-sector-multiple requests, otherwise
 * the rf_DoAccess() result.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int do_async;
	int rc;

	/* Enforce the simultaneous-I/O limit (raidPtr->openings). */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
	    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	/* pb flags a trailing partial sector */
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): "1 ||" forces this branch on unconditionally --
	 * looks like a debugging leftover; confirm before changing. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
		    (int) raid_addr, (int) sum, (int) num_blocks,
		    (int) pb, (int) bp->b_resid));
	}
	/* reject accesses past the end of the set; the extra "sum <"
	 * comparisons catch wraparound of the unsigned sum */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening for this access. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
	    RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
	    do_async, raid_addr, num_blocks,
	    bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
1974
1975 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
1976
/*
 * Dispatch one disk-queue request to the underlying component via
 * bdev_strategy().  Disk queue should be locked upon entry; the queue
 * mutex is dropped around bdev_strategy() since that call can block.
 * Completion is delivered asynchronously through KernelWakeupFunc().
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately through the normal path. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up bp for this transfer; completion calls
		 * KernelWakeupFunc with req as the argument. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
			    (long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
		    req->type, queue->raidPtr->raidid,
		    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
		    (int) req->sectorOffset, (int) req->numSector,
		    (int) (req->numSector <<
		    queue->raidPtr->logBytesPerSector),
		    (int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
2050 /* this is the callback function associated with a I/O invoked from
2051 kernel code.
2052 */
/*
 * biodone callback for component I/O issued by rf_DispatchKernelIO().
 * Records the error (possibly failing the component), then moves the
 * request to the raidPtr->iodone queue and signals the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* req was stashed in b_private by InitBP()/the NOP path. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_optimal) ||
		    (queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_used_spare)) &&
		    (queue->raidPtr->numFailures <
		    queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			    queue->raidPtr->raidid,
			    bp->b_error,
			    queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2119
2120
2121 /*
2122 * initialize a buf structure for doing an I/O in the kernel.
2123 */
2124 static void
2125 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2126 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2127 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
2128 {
2129 bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
2130 bp->b_oflags = 0;
2131 bp->b_cflags = 0;
2132 bp->b_bcount = numSect << logBytesPerSector;
2133 bp->b_bufsize = bp->b_bcount;
2134 bp->b_error = 0;
2135 bp->b_dev = dev;
2136 bp->b_data = bf;
2137 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2138 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2139 if (bp->b_bcount == 0) {
2140 panic("bp->b_bcount is zero in InitBP!!");
2141 }
2142 bp->b_iodone = cbFunc;
2143 bp->b_private = cbArg;
2144 }
2145
2146 /*
2147 * Wait interruptibly for an exclusive lock.
2148 *
2149 * XXX
2150 * Several drivers do this; it should be abstracted and made MP-safe.
2151 * (Hmm... where have we seen this warning before :-> GO )
2152 */
2153 static int
2154 raidlock(struct raid_softc *rs)
2155 {
2156 int error;
2157
2158 error = 0;
2159 mutex_enter(&rs->sc_mutex);
2160 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2161 rs->sc_flags |= RAIDF_WANTED;
2162 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2163 if (error != 0)
2164 goto done;
2165 }
2166 rs->sc_flags |= RAIDF_LOCKED;
2167 done:
2168 mutex_exit(&rs->sc_mutex);
2169 return error;
2170 }
2171 /*
2172 * Unlock and wake up any waiters.
2173 */
2174 static void
2175 raidunlock(struct raid_softc *rs)
2176 {
2177
2178 mutex_enter(&rs->sc_mutex);
2179 rs->sc_flags &= ~RAIDF_LOCKED;
2180 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2181 rs->sc_flags &= ~RAIDF_WANTED;
2182 cv_broadcast(&rs->sc_cv);
2183 }
2184 mutex_exit(&rs->sc_mutex);
2185 }
2186
2187
2188 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2189 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2190 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2191
/* Byte offset of the component-info (label) area on each component. */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
2198
2199 static daddr_t
2200 rf_component_info_size(unsigned secsize)
2201 {
2202 daddr_t info_size;
2203
2204 KASSERT(secsize);
2205 if (secsize > RF_COMPONENT_INFO_SIZE)
2206 info_size = secsize;
2207 else
2208 info_size = RF_COMPONENT_INFO_SIZE;
2209
2210 return info_size;
2211 }
2212
2213 static daddr_t
2214 rf_parity_map_offset(RF_Raid_t *raidPtr)
2215 {
2216 daddr_t map_offset;
2217
2218 KASSERT(raidPtr->bytesPerSector);
2219 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2220 map_offset = raidPtr->bytesPerSector;
2221 else
2222 map_offset = RF_COMPONENT_INFO_SIZE;
2223 map_offset += rf_component_info_offset();
2224
2225 return map_offset;
2226 }
2227
2228 static daddr_t
2229 rf_parity_map_size(RF_Raid_t *raidPtr)
2230 {
2231 daddr_t map_size;
2232
2233 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2234 map_size = raidPtr->bytesPerSector;
2235 else
2236 map_size = RF_PARITY_MAP_SIZE;
2237
2238 return map_size;
2239 }
2240
2241 int
2242 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2243 {
2244 RF_ComponentLabel_t *clabel;
2245
2246 clabel = raidget_component_label(raidPtr, col);
2247 clabel->clean = RF_RAID_CLEAN;
2248 raidflush_component_label(raidPtr, col);
2249 return(0);
2250 }
2251
2252
2253 int
2254 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2255 {
2256 RF_ComponentLabel_t *clabel;
2257
2258 clabel = raidget_component_label(raidPtr, col);
2259 clabel->clean = RF_RAID_DIRTY;
2260 raidflush_component_label(raidPtr, col);
2261 return(0);
2262 }
2263
/*
 * Re-read the component label for column `col' from disk into the
 * in-core copy kept in raid_cinfo[col].ci_label.  Returns 0 or an errno.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2273
/* Return a pointer to the in-core component label for column `col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2279
/*
 * Write the in-core component label for column `col' back to disk,
 * stamping it with the array's current mod_counter first.
 * Returns 0 or an errno.
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* Keep the parity-map mod counter in lockstep with the label's. */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
2294
2295
/* Read a component label from the component-info area of the device. */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
2305
/*
 * Synchronously read `msize' bytes of component metadata into `data'.
 * The on-disk area starts `offset' bytes into the device and is `dsize'
 * bytes long (dsize >= msize; the excess is discarded).
 * Returns 0 on success or an errno.
 */
/* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
	bp->b_resid = dsize;

	/* Issue the read and wait for it to complete. */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		/* Only the first msize bytes are meaningful to the caller. */
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
2343
2344
/* Write a component label to the component-info area of the device. */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
2354
/*
 * Write `msize' bytes of component metadata from `data'.  The on-disk
 * area is `dsize' bytes long (zero-padded beyond msize) and starts
 * `offset' bytes into the device.  If `asyncp' is set, the write is
 * fired off B_ASYNC and 0 is returned without waiting; the buffer is
 * then not released here (NOTE(review): presumably reclaimed on I/O
 * completion via the async biodone path -- verify).
 * Returns 0 on success or an errno.
 */
/* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
	bp->b_resid = dsize;

	/* Zero-fill so the tail of the area is deterministic on disk. */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
2389
2390 void
2391 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2392 {
2393 int c;
2394
2395 for (c = 0; c < raidPtr->numCol; c++) {
2396 /* Skip dead disks. */
2397 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2398 continue;
2399 /* XXXjld: what if an error occurs here? */
2400 raidwrite_component_area(raidPtr->Disks[c].dev,
2401 raidPtr->raid_cinfo[c].ci_vp, map,
2402 RF_PARITYMAP_NBYTE,
2403 rf_parity_map_offset(raidPtr),
2404 rf_parity_map_size(raidPtr), 0);
2405 }
2406 }
2407
2408 void
2409 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2410 {
2411 struct rf_paritymap_ondisk tmp;
2412 int c,first;
2413
2414 first=1;
2415 for (c = 0; c < raidPtr->numCol; c++) {
2416 /* Skip dead disks. */
2417 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2418 continue;
2419 raidread_component_area(raidPtr->Disks[c].dev,
2420 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2421 RF_PARITYMAP_NBYTE,
2422 rf_parity_map_offset(raidPtr),
2423 rf_parity_map_size(raidPtr));
2424 if (first) {
2425 memcpy(map, &tmp, sizeof(*map));
2426 first = 0;
2427 } else {
2428 rf_paritymap_merge(map, &tmp);
2429 }
2430 }
2431 }
2432
/*
 * Bump the array's modification counter and mark the component label of
 * every live component (and every in-use spare) dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			/* NOTE(review): if no column references this spare,
			   scol is still -1 (or a stale value from a previous
			   iteration) here -- presumably impossible for an
			   rf_ds_used_spare disk; verify. */
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2492
2493
/*
 * Bump the mod counter and rewrite the component labels of all optimal
 * components and all in-use spares.  If `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, the labels are
 * additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2571
2572 void
2573 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2574 {
2575
2576 if (vp != NULL) {
2577 if (auto_configured == 1) {
2578 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2579 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2580 vput(vp);
2581
2582 } else {
2583 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2584 }
2585 }
2586 }
2587
2588
2589 void
2590 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2591 {
2592 int r,c;
2593 struct vnode *vp;
2594 int acd;
2595
2596
2597 /* We take this opportunity to close the vnodes like we should.. */
2598
2599 for (c = 0; c < raidPtr->numCol; c++) {
2600 vp = raidPtr->raid_cinfo[c].ci_vp;
2601 acd = raidPtr->Disks[c].auto_configured;
2602 rf_close_component(raidPtr, vp, acd);
2603 raidPtr->raid_cinfo[c].ci_vp = NULL;
2604 raidPtr->Disks[c].auto_configured = 0;
2605 }
2606
2607 for (r = 0; r < raidPtr->numSpare; r++) {
2608 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2609 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2610 rf_close_component(raidPtr, vp, acd);
2611 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2612 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2613 }
2614 }
2615
2616
/*
 * Kernel thread body: fail the component named in `req', kicking off a
 * reconstruction to a spare if RF_FDFLAGS_RECON is set, then exit.
 * `req' is freed here.
 */
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	/* Flag visible to the rest of the driver while we work. */
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2638
/*
 * Kernel thread body: rewrite all parity for the array.  On success the
 * in-core parity state is marked clean; anyone blocked in shutdown
 * waiting for the rewrite is woken before the thread exits.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit! If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop? If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2671
2672
/*
 * Kernel thread body: run rf_CopybackReconstructedData() for the array,
 * then exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2687
2688
/*
 * Kernel thread body: reconstruct column req->col in place, then exit.
 * `req' is freed here.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2706
/*
 * Try to read and validate a component label from (dev, vp).  If the
 * label looks reasonable, prepend a new RF_AutoConfig_t carrying the
 * label and the (open, unlocked) vnode to ac_list; otherwise the label
 * is freed and the vnode is closed and released.  Returns the possibly
 * updated list head.
 * NOTE(review): vp is expected to be unlocked on entry -- the failure
 * path re-locks it before VOP_CLOSE; verify against callers.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label. Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			    cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2748
/*
 * Scan every disk device in the system for RAIDframe components.
 * Wedges are scanned on the first pass and everything else on the
 * second, so that a wedge covering a whole disk is preferred over the
 * disk's raw partition.  Returns a list of RF_AutoConfig_t entries for
 * all components found (see rf_get_component()).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		    dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* Only wedges typed "raidframe" are candidates. */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes over the (unlocked)
				   vnode from here. */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists. Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
2955
2956
2957 int
2958 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
2959 {
2960
2961 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2962 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2963 ((clabel->clean == RF_RAID_CLEAN) ||
2964 (clabel->clean == RF_RAID_DIRTY)) &&
2965 clabel->row >=0 &&
2966 clabel->column >= 0 &&
2967 clabel->num_rows > 0 &&
2968 clabel->num_columns > 0 &&
2969 clabel->row < clabel->num_rows &&
2970 clabel->column < clabel->num_columns &&
2971 clabel->blockSize > 0 &&
2972 /*
2973 * numBlocksHi may contain garbage, but it is ok since
2974 * the type is unsigned. If it is really garbage,
2975 * rf_fix_old_label_size() will fix it.
2976 */
2977 rf_component_label_numblocks(clabel) > 0) {
2978 /*
2979 * label looks reasonable enough...
2980 * let's make sure it has no old garbage.
2981 */
2982 if (numsecs)
2983 rf_fix_old_label_size(clabel, numsecs);
2984 return(1);
2985 }
2986 return(0);
2987 }
2988
2989
2990 /*
2991 * For reasons yet unknown, some old component labels have garbage in
2992 * the newer numBlocksHi region, and this causes lossage. Since those
2993 * disks will also have numsecs set to less than 32 bits of sectors,
2994 * we can determine when this corruption has occurred, and fix it.
2995 *
2996 * The exact same problem, with the same unknown reason, happens to
2997 * the partitionSizeHi member as well.
2998 */
2999 static void
3000 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3001 {
3002
3003 if (numsecs < ((uint64_t)1 << 32)) {
3004 if (clabel->numBlocksHi) {
3005 printf("WARNING: total sectors < 32 bits, yet "
3006 "numBlocksHi set\n"
3007 "WARNING: resetting numBlocksHi to zero.\n");
3008 clabel->numBlocksHi = 0;
3009 }
3010
3011 if (clabel->partitionSizeHi) {
3012 printf("WARNING: total sectors < 32 bits, yet "
3013 "partitionSizeHi set\n"
3014 "WARNING: resetting partitionSizeHi to zero.\n");
3015 clabel->partitionSizeHi = 0;
3016 }
3017 }
3018 }
3019
3020
3021 #ifdef DEBUG
/* Dump the contents of a component label to the console (DEBUG only). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
3053 #endif
3054
3055 RF_ConfigSet_t *
3056 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3057 {
3058 RF_AutoConfig_t *ac;
3059 RF_ConfigSet_t *config_sets;
3060 RF_ConfigSet_t *cset;
3061 RF_AutoConfig_t *ac_next;
3062
3063
3064 config_sets = NULL;
3065
3066 /* Go through the AutoConfig list, and figure out which components
3067 belong to what sets. */
3068 ac = ac_list;
3069 while(ac!=NULL) {
3070 /* we're going to putz with ac->next, so save it here
3071 for use at the end of the loop */
3072 ac_next = ac->next;
3073
3074 if (config_sets == NULL) {
3075 /* will need at least this one... */
3076 config_sets = malloc(sizeof(RF_ConfigSet_t),
3077 M_RAIDFRAME, M_WAITOK);
3078 /* this one is easy :) */
3079 config_sets->ac = ac;
3080 config_sets->next = NULL;
3081 config_sets->rootable = 0;
3082 ac->next = NULL;
3083 } else {
3084 /* which set does this component fit into? */
3085 cset = config_sets;
3086 while(cset!=NULL) {
3087 if (rf_does_it_fit(cset, ac)) {
3088 /* looks like it matches... */
3089 ac->next = cset->ac;
3090 cset->ac = ac;
3091 break;
3092 }
3093 cset = cset->next;
3094 }
3095 if (cset==NULL) {
3096 /* didn't find a match above... new set..*/
3097 cset = malloc(sizeof(RF_ConfigSet_t),
3098 M_RAIDFRAME, M_WAITOK);
3099 cset->ac = ac;
3100 ac->next = NULL;
3101 cset->next = config_sets;
3102 cset->rootable = 0;
3103 config_sets = cset;
3104 }
3105 }
3106 ac = ac_next;
3107 }
3108
3109
3110 return(config_sets);
3111 }
3112
3113 static int
3114 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3115 {
3116 RF_ComponentLabel_t *clabel1, *clabel2;
3117
3118 /* If this one matches the *first* one in the set, that's good
3119 enough, since the other members of the set would have been
3120 through here too... */
3121 /* note that we are not checking partitionSize here..
3122
3123 Note that we are also not checking the mod_counters here.
3124 If everything else matches except the mod_counter, that's
3125 good enough for this test. We will deal with the mod_counters
3126 a little later in the autoconfiguration process.
3127
3128 (clabel1->mod_counter == clabel2->mod_counter) &&
3129
3130 The reason we don't check for this is that failed disks
3131 will have lower modification counts. If those disks are
3132 not added to the set they used to belong to, then they will
3133 form their own set, which may result in 2 different sets,
3134 for example, competing to be configured at raid0, and
3135 perhaps competing to be the root filesystem set. If the
3136 wrong ones get configured, or both attempt to become /,
3137 weird behaviour and or serious lossage will occur. Thus we
3138 need to bring them into the fold here, and kick them out at
3139 a later point.
3140
3141 */
3142
3143 clabel1 = cset->ac->clabel;
3144 clabel2 = ac->clabel;
3145 if ((clabel1->version == clabel2->version) &&
3146 (clabel1->serial_number == clabel2->serial_number) &&
3147 (clabel1->num_rows == clabel2->num_rows) &&
3148 (clabel1->num_columns == clabel2->num_columns) &&
3149 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3150 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3151 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3152 (clabel1->parityConfig == clabel2->parityConfig) &&
3153 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3154 (clabel1->blockSize == clabel2->blockSize) &&
3155 rf_component_label_numblocks(clabel1) ==
3156 rf_component_label_numblocks(clabel2) &&
3157 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3158 (clabel1->root_partition == clabel2->root_partition) &&
3159 (clabel1->last_unit == clabel2->last_unit) &&
3160 (clabel1->config_order == clabel2->config_order)) {
3161 /* if it get's here, it almost *has* to be a match */
3162 } else {
3163 /* it's not consistent with somebody in the set..
3164 punt */
3165 return(0);
3166 }
3167 /* all was fine.. it must fit... */
3168 return(1);
3169 }
3170
/*
 * Decide whether a config set has enough live components to be
 * configured.  The highest mod_counter seen in the set identifies the
 * current members; components with stale counters count as missing.
 * RAID 1 is special-cased: a mirror pair (even/odd column pair) may
 * lose one member but not both.  Returns 1 if the set is configurable.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a component with the current
	   mod_counter. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no loss; RAID 4/5 tolerate one missing disk. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3273
/*
 * Build an RF_Config_t for an auto-config set: copy the geometry common
 * to all components from the first label, then record each component's
 * device name under its column.
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
    RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numCol = clabel->num_columns;
	config->numSpare = 0; /* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ?? */

	while(ac!=NULL) {
		/* row/col values will be in range due to the checks
		   in reasonable_label() */
		strcpy(config->devnames[0][ac->clabel->column],
		       ac->devname);
		ac = ac->next;
	}

	for(i=0;i<RF_MAXDBGV;i++) {
		config->debugVars[i][0] = 0;
	}
}
3307
3308 int
3309 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3310 {
3311 RF_ComponentLabel_t *clabel;
3312 int column;
3313 int sparecol;
3314
3315 raidPtr->autoconfigure = new_value;
3316
3317 for(column=0; column<raidPtr->numCol; column++) {
3318 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3319 clabel = raidget_component_label(raidPtr, column);
3320 clabel->autoconfigure = new_value;
3321 raidflush_component_label(raidPtr, column);
3322 }
3323 }
3324 for(column = 0; column < raidPtr->numSpare ; column++) {
3325 sparecol = raidPtr->numCol + column;
3326 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3327 clabel = raidget_component_label(raidPtr, sparecol);
3328 clabel->autoconfigure = new_value;
3329 raidflush_component_label(raidPtr, sparecol);
3330 }
3331 }
3332 return(new_value);
3333 }
3334
3335 int
3336 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3337 {
3338 RF_ComponentLabel_t *clabel;
3339 int column;
3340 int sparecol;
3341
3342 raidPtr->root_partition = new_value;
3343 for(column=0; column<raidPtr->numCol; column++) {
3344 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3345 clabel = raidget_component_label(raidPtr, column);
3346 clabel->root_partition = new_value;
3347 raidflush_component_label(raidPtr, column);
3348 }
3349 }
3350 for(column = 0; column < raidPtr->numSpare ; column++) {
3351 sparecol = raidPtr->numCol + column;
3352 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3353 clabel = raidget_component_label(raidPtr, sparecol);
3354 clabel->root_partition = new_value;
3355 raidflush_component_label(raidPtr, sparecol);
3356 }
3357 }
3358 return(new_value);
3359 }
3360
/*
 * Close and release the component vnodes of a config set.  Components
 * whose vnode was already released (ac->vp == NULL) are skipped, so
 * this is safe to call more than once on the same set.
 */
void
rf_release_all_vps(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;

	ac = cset->ac;
	while(ac!=NULL) {
		/* Close the vp, and give it back */
		if (ac->vp) {
			/* VOP_CLOSE expects a locked vnode; vput()
			   then unlocks it and drops the reference. */
			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
			vput(ac->vp);
			ac->vp = NULL;
		}
		ac = ac->next;
	}
}
3378
3379
3380 void
3381 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3382 {
3383 RF_AutoConfig_t *ac;
3384 RF_AutoConfig_t *next_ac;
3385
3386 ac = cset->ac;
3387 while(ac!=NULL) {
3388 next_ac = ac->next;
3389 /* nuke the label */
3390 free(ac->clabel, M_RAIDFRAME);
3391 /* cleanup the config structure */
3392 free(ac, M_RAIDFRAME);
3393 /* "next.." */
3394 ac = next_ac;
3395 }
3396 /* and, finally, nuke the config set */
3397 free(cset, M_RAIDFRAME);
3398 }
3399
3400
/*
 * Initialize a component label from the current state of the RAID
 * set: version, serial number / mod counter, geometry, and the
 * autoconfigure/root settings.  The label is marked dirty and the
 * component status set to optimal.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* num_rows is fixed at 1. */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* Layout geometry, taken verbatim from the in-core layout. */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3433
/*
 * Bring up a RAID set described by an auto-configuration config set.
 * Returns the configured softc on success, or NULL if rf_Configure()
 * failed.  The caller retains ownership of "cset".
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* Scan upward from last_unit until we find a unit that is
	   either unconfigured (valid == 0) or does not exist yet. */
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* The unit has never been created: allocate it now. */
	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed: give back the unit we claimed. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3505
/*
 * Initialize a RAIDframe memory pool of "size"-byte items with wait
 * channel "w_chan": primed with xmin items, high-water mark xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
3515
3516 /*
3517 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3518 * to see if there is IO pending and if that IO could possibly be done
3519 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3520 * otherwise.
3521 *
3522 */
3523 int
3524 rf_buf_queue_check(RF_Raid_t *raidPtr)
3525 {
3526 struct raid_softc *rs;
3527 struct dk_softc *dksc;
3528
3529 rs = raidPtr->softc;
3530 dksc = &rs->sc_dksc;
3531
3532 if ((rs->sc_flags & RAIDF_INITED) == 0)
3533 return 1;
3534
3535 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3536 /* there is work to do */
3537 return 0;
3538 }
3539 /* default is nothing to do */
3540 return 1;
3541 }
3542
3543 int
3544 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3545 {
3546 uint64_t numsecs;
3547 unsigned secsize;
3548 int error;
3549
3550 error = getdisksize(vp, &numsecs, &secsize);
3551 if (error == 0) {
3552 diskPtr->blockSize = secsize;
3553 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3554 diskPtr->partitionSize = numsecs;
3555 return 0;
3556 }
3557 return error;
3558 }
3559
/*
 * Autoconfiguration match: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3565
/*
 * Autoconfiguration attach: intentionally empty.  Real initialization
 * happens when a set is configured (see rf_auto_config_set() and the
 * ioctl configuration path), not at device attach time.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3570
3571
3572 static int
3573 raid_detach(device_t self, int flags)
3574 {
3575 int error;
3576 struct raid_softc *rs = raidsoftc(self);
3577
3578 if (rs == NULL)
3579 return ENXIO;
3580
3581 if ((error = raidlock(rs)) != 0)
3582 return error;
3583
3584 error = raid_detach_unlocked(rs);
3585
3586 raidunlock(rs);
3587
3588 /* XXX raid can be referenced here */
3589
3590 if (error)
3591 return error;
3592
3593 /* Free the softc */
3594 raidput(rs);
3595
3596 return 0;
3597 }
3598
/*
 * Publish a synthetic disk geometry for the RAID set via
 * disk_set_info().  A RAID volume has no physical geometry, so the
 * track count (4 * numCol) is fabricated; only secperunit and secsize
 * reflect real values.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	/* Clear any stale geometry before filling in the new one. */
	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
3614
3615 /*
3616 * Get cache info for all the components (including spares).
3617 * Returns intersection of all the cache flags of all disks, or first
3618 * error if any encountered.
3619 * XXXfua feature flags can change as spares are added - lock down somehow
3620 */
3621 static int
3622 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3623 {
3624 int c;
3625 int error;
3626 int dkwhole = 0, dkpart;
3627
3628 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3629 /*
3630 * Check any non-dead disk, even when currently being
3631 * reconstructed.
3632 */
3633 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
3634 || raidPtr->Disks[c].status == rf_ds_reconstructing) {
3635 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3636 DIOCGCACHE, &dkpart, FREAD, NOCRED);
3637 if (error) {
3638 if (error != ENODEV) {
3639 printf("raid%d: get cache for component %s failed\n",
3640 raidPtr->raidid,
3641 raidPtr->Disks[c].devname);
3642 }
3643
3644 return error;
3645 }
3646
3647 if (c == 0)
3648 dkwhole = dkpart;
3649 else
3650 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3651 }
3652 }
3653
3654 *data = dkwhole;
3655
3656 return 0;
3657 }
3658
3659 /*
3660 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3661 * We end up returning whatever error was returned by the first cache flush
3662 * that fails.
3663 */
3664
3665 static int
3666 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
3667 {
3668 int e = 0;
3669 for (int i = 0; i < 5; i++) {
3670 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3671 &force, FWRITE, NOCRED);
3672 if (!e || e == ENODEV)
3673 return e;
3674 printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
3675 raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
3676 }
3677 return e;
3678 }
3679
3680 int
3681 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
3682 {
3683 int c, error;
3684
3685 error = 0;
3686 for (c = 0; c < raidPtr->numCol; c++) {
3687 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3688 int e = rf_sync_component_cache(raidPtr, c, force);
3689 if (e && !error)
3690 error = e;
3691 }
3692 }
3693
3694 for (c = 0; c < raidPtr->numSpare ; c++) {
3695 int sparecol = raidPtr->numCol + c;
3696 /* Need to ensure that the reconstruct actually completed! */
3697 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3698 int e = rf_sync_component_cache(raidPtr, sparecol,
3699 force);
3700 if (e && !error)
3701 error = e;
3702 }
3703 }
3704 return error;
3705 }
3706
3707 /* Fill in info with the current status */
3708 void
3709 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3710 {
3711
3712 if (raidPtr->status != rf_rs_reconstructing) {
3713 info->total = 100;
3714 info->completed = 100;
3715 } else {
3716 info->total = raidPtr->reconControl->numRUsTotal;
3717 info->completed = raidPtr->reconControl->numRUsComplete;
3718 }
3719 info->remaining = info->total - info->completed;
3720 }
3721
3722 /* Fill in info with the current status */
3723 void
3724 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3725 {
3726
3727 if (raidPtr->parity_rewrite_in_progress == 1) {
3728 info->total = raidPtr->Layout.numStripe;
3729 info->completed = raidPtr->parity_rewrite_stripes_done;
3730 } else {
3731 info->completed = 100;
3732 info->total = 100;
3733 }
3734 info->remaining = info->total - info->completed;
3735 }
3736
3737 /* Fill in info with the current status */
3738 void
3739 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3740 {
3741
3742 if (raidPtr->copyback_in_progress == 1) {
3743 info->total = raidPtr->Layout.numStripe;
3744 info->completed = raidPtr->copyback_stripes_done;
3745 info->remaining = info->total - info->completed;
3746 } else {
3747 info->remaining = 0;
3748 info->completed = 100;
3749 info->total = 100;
3750 }
3751 }
3752
3753 /* Fill in config with the current info */
3754 int
3755 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3756 {
3757 int d, i, j;
3758
3759 if (!raidPtr->valid)
3760 return ENODEV;
3761 config->cols = raidPtr->numCol;
3762 config->ndevs = raidPtr->numCol;
3763 if (config->ndevs >= RF_MAX_DISKS)
3764 return ENOMEM;
3765 config->nspares = raidPtr->numSpare;
3766 if (config->nspares >= RF_MAX_DISKS)
3767 return ENOMEM;
3768 config->maxqdepth = raidPtr->maxQueueDepth;
3769 d = 0;
3770 for (j = 0; j < config->cols; j++) {
3771 config->devs[d] = raidPtr->Disks[j];
3772 d++;
3773 }
3774 for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
3775 config->spares[i] = raidPtr->Disks[j];
3776 if (config->spares[i].status == rf_ds_rebuilding_spare) {
3777 /* XXX: raidctl(8) expects to see this as a used spare */
3778 config->spares[i].status = rf_ds_used_spare;
3779 }
3780 }
3781 return 0;
3782 }
3783
3784 int
3785 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
3786 {
3787 RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
3788 RF_ComponentLabel_t *raid_clabel;
3789 int column = clabel->column;
3790
3791 if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
3792 return EINVAL;
3793 raid_clabel = raidget_component_label(raidPtr, column);
3794 memcpy(clabel, raid_clabel, sizeof *clabel);
3795
3796 return 0;
3797 }
3798
3799 /*
3800 * Module interface
3801 */
3802
3803 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
3804
3805 #ifdef _MODULE
3806 CFDRIVER_DECL(raid, DV_DISK, NULL);
3807 #endif
3808
3809 static int raid_modcmd(modcmd_t, void *);
3810 static int raid_modcmd_init(void);
3811 static int raid_modcmd_fini(void);
3812
3813 static int
3814 raid_modcmd(modcmd_t cmd, void *data)
3815 {
3816 int error;
3817
3818 error = 0;
3819 switch (cmd) {
3820 case MODULE_CMD_INIT:
3821 error = raid_modcmd_init();
3822 break;
3823 case MODULE_CMD_FINI:
3824 error = raid_modcmd_fini();
3825 break;
3826 default:
3827 error = ENOTTY;
3828 break;
3829 }
3830 return error;
3831 }
3832
/*
 * Module initialization: attach the device switch and autoconf glue,
 * boot the RAIDframe core, and register a finalizer that will
 * auto-configure RAID sets once all real hardware has been found.
 * Each failing step rolls back the steps that already succeeded.
 * Returns 0 or an errno.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 == no preassigned device majors. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST (devsw already attached) is tolerated. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attachment. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back the cfdriver and devsw attachments. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is 0 on every path that reaches this point. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: continue without autoconfiguration. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
3903
/*
 * Module finalization: refuse to unload while any raid units exist;
 * otherwise detach the autoconf glue and device switch, shut down
 * the RAIDframe core, and destroy the module lock.  Each failing
 * detach step re-attaches what was already detached so the module
 * stays usable.  Returns 0 or an errno.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Undo the cfattach detach and bail. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Undo the cfdriver and cfattach detaches and bail. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
3953