/*	$NetBSD: rf_netbsdkintf.c,v 1.384 2020/06/19 19:29:39 jdolecek Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Greg Oster; Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *
 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
 */

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Mark Holland, Jim Zelenka
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/***********************************************************
 *
 * rf_kintf.c -- the kernel interface routines for RAIDframe
 *
 ***********************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.384 2020/06/19 19:29:39 jdolecek Exp $");

#ifdef _KERNEL_OPT
#include "opt_raid_autoconfig.h"
#include "opt_compat_netbsd32.h"
#endif

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/disk.h>
#include <sys/device.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/reboot.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/compat_stub.h>

#include <prop/proplib.h>

#include <dev/raidframe/raidframevar.h>
#include <dev/raidframe/raidframeio.h>
#include <dev/raidframe/rf_paritymap.h>

#include "rf_raid.h"
#include "rf_copyback.h"
#include "rf_dag.h"
#include "rf_dagflags.h"
#include "rf_desc.h"
#include "rf_diskqueue.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_kintf.h"
#include "rf_options.h"
#include "rf_driver.h"
#include "rf_parityscan.h"
#include "rf_threadstuff.h"

#include "ioconf.h"

#ifdef DEBUG
int rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else				/* DEBUG */
#define db1_printf(a) { }
#endif				/* DEBUG */

#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */
#endif

const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);

MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");

/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int);
static void raidinit(struct raid_softc *);
static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);

static int raid_match(device_t, cfdata_t, void *);
static void raid_attach(device_t, device_t, void *);
static int raid_detach(device_t, int);

static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

static int raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);

static int raid_diskstart(device_t, struct buf *bp);
static int raid_dumpblocks(device_t, void *, daddr_t, int);
static int raid_lastclose(device_t);

static dev_type_open(raidopen);
static dev_type_close(raidclose);
static dev_type_read(raidread);
static dev_type_write(raidwrite);
static dev_type_ioctl(raidioctl);
static dev_type_strategy(raidstrategy);
static dev_type_dump(raiddump);
static dev_type_size(raidsize);

const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};

#define raidunit(x) DISKUNIT(x)
#define raidsoftc(dev) (((struct raid_softc *)device_private(dev))->sc_r.softc)

extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;
	RF_ReconReqFlags_t flags;
	void *raidPtr;
};

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING 6
#endif
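
/*
 * A worked instance of the arithmetic above (illustrative only, using
 * the same hypothetical 5-disk, 32k-stripe layout): with the default
 * RAIDOUTSTANDING of 6, a worst case of 6 concurrent 64K writes needs
 * roughly
 *
 *	6 * (64K old data + 64K old parity + 64K new parity) = 1152K
 *
 * of kernel memory, on top of 6 * 64K = 384K of incoming data.
 */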

#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

void rf_ReconThread(struct rf_recon_req_internal *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
int rf_autoconfig(device_t);
void rf_buildroothack(RF_ConfigSet_t *);

RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *, RF_AutoConfig_t *);
int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
void rf_create_configuration(RF_AutoConfig_t *, RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
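
/*
 * Example (standard kernel config usage): build autoconfiguration in
 * by adding
 *
 *	options 	RAID_AUTOCONFIG
 *
 * to the kernel config file; a configured set is marked as
 * autoconfigurable at runtime with raidctl(8) -A.
 */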
static bool raidautoconfigdone = false;

struct RF_Pools_s rf_pools;

static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;

static struct raid_softc *
raidcreate(int unit) {
	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
	sc->sc_unit = unit;
	cv_init(&sc->sc_cv, "raidunit");
	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
	return sc;
}

static void
raiddestroy(struct raid_softc *sc) {
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mutex);
	kmem_free(sc, sizeof(*sc));
}

static struct raid_softc *
raidget(int unit, bool create) {
	struct raid_softc *sc;
	if (unit < 0) {
#ifdef DIAGNOSTIC
		panic("%s: unit %d!", __func__, unit);
#endif
		return NULL;
	}
	mutex_enter(&raid_lock);
	LIST_FOREACH(sc, &raids, sc_link) {
		if (sc->sc_unit == unit) {
			mutex_exit(&raid_lock);
			return sc;
		}
	}
	mutex_exit(&raid_lock);
	if (!create)
		return NULL;
	sc = raidcreate(unit);
	mutex_enter(&raid_lock);
	LIST_INSERT_HEAD(&raids, sc, sc_link);
	mutex_exit(&raid_lock);
	return sc;
}

static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}

void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}

int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}

int
rf_inited(const struct raid_softc *rs) {
	return (rs->sc_flags & RAIDF_INITED) != 0;
}

RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}

int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}

static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
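
/*
 * Illustration of the matching above (hypothetical names): a component
 * "/dev/wd0a" has its "/dev/" prefix skipped, leaving "wd0a", which
 * matches a boot device named "wd0" on the strncmp(); a wedge component
 * such as "dk3" is first mapped to its parent device (e.g. "wd0")
 * before the comparison is made.
 */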

void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
		    "contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}

static int
raidsize(dev_t dev)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;
	unsigned int unit;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return -1;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return -1;

	return dk_size(dksc, dev);
}

static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

	/*
	   Note that blkno is relative to this particular partition.
	   By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
	*/
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}
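
/*
 * Sketch of the offset arithmetic above: RAIDframe reserves the first
 * RF_PROTECTED_SECTORS of each component (the component label lives
 * there), so block 0 of the RAID partition corresponds to block
 * RF_PROTECTED_SECTORS of the underlying component partition.
 */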

static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for (j = 0; j < raidPtr->numCol; j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}

/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int error = 0;
	int part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}

static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/* Last one... device is not unconfigured yet.
	   Device shutdown has taken care of setting the
	   clean bits if RAIDF_INITED is not set
	   mark things as clean... */

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}

/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}

static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}

static void
raidstrategy(struct buf *bp)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Raid_t *raidPtr;

	unit = raidunit(bp->b_dev);
	if ((rs = raidget(unit, false)) == NULL) {
		bp->b_error = ENXIO;
		goto fail;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto fail;
	}
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	/* Queue IO only */
	if (dk_strategy_defer(dksc, bp))
		goto done;

	/* schedule the IO to happen at the next convenient time */
	raid_wakeup(raidPtr);

done:
	return;

fail:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}

static int
raid_diskstart(device_t dev, struct buf *bp)
{
	struct raid_softc *rs = raidsoftc(dev);
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		db1_printf(("raid is not valid..\n"));
		return ENODEV;
	}

	/* XXX */
	bp->b_resid = 0;

	return raiddoaccess(raidPtr, bp);
}

void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}

/* ARGSUSED */
static int
raidread(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (ENXIO);

	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));

}

/* ARGSUSED */
static int
raidwrite(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (ENXIO);

	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));

}

static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}

static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}

int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}

static int
rf_copyinspecificbuf(RF_Config_t *k_cfg)
{
	/* allocate a buffer for the layout-specific data, and copy it in */
	if (k_cfg->layoutSpecificSize == 0)
		return 0;

	if (k_cfg->layoutSpecificSize > 10000) {
		/* sanity check */
		return EINVAL;
	}

	u_char *specific_buf;
	specific_buf = RF_Malloc(k_cfg->layoutSpecificSize);
	if (specific_buf == NULL)
		return ENOMEM;

	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
	    k_cfg->layoutSpecificSize);
	if (retcode) {
		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
		return retcode;
	}

	k_cfg->layoutSpecific = specific_buf;
	return 0;
}

static int
rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
{
	RF_Config_t *u_cfg = *((RF_Config_t **) data);

	if (rs->sc_r.valid) {
		/* There is a valid RAID set running on this unit! */
		printf("raid%d: Device already configured!\n", rs->sc_unit);
		return EINVAL;
	}

	/* copy-in the configuration information */
	/* data points to a pointer to the configuration structure */
	*k_cfg = RF_Malloc(sizeof(**k_cfg));
	if (*k_cfg == NULL) {
		return ENOMEM;
	}
	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
	if (retcode == 0)
		return 0;
	RF_Free(*k_cfg, sizeof(RF_Config_t));
	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
	rs->sc_flags |= RAIDF_SHUTDOWN;
	return retcode;
}
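
/*
 * Userland sketch (illustrative only; raidctl(8) performs the real
 * configuration): RAIDFRAME_CONFIGURE passes a pointer *to a pointer*
 * to the RF_Config_t, which is why the code above dereferences data
 * before the copyin:
 *
 *	RF_Config_t cfg, *cfgp = &cfg;
 *	memset(&cfg, 0, sizeof(cfg));
 *	// ... fill in numCol, parityConfig, the component devnames, ...
 *	if (ioctl(fd, RAIDFRAME_CONFIGURE, &cfgp) == -1)
 *		err(1, "RAIDFRAME_CONFIGURE");
 */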

int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 * there is no stale data left in the case of a
	 * reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}

#if RF_DISABLED
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif

static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above.  We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	*/

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we don't pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}

static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d: Col: %d Too many failures.\n",
		    raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d: Col: %d "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}

static int
rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
{
	/*
	 * This makes no sense on a RAID 0, or if we are not reconstructing,
	 * so tell the user it's done.
	 */
	if (raidPtr->Layout.map->faultsTolerated == 0 ||
	    raidPtr->status != rf_rs_reconstructing) {
		*data = 100;
		return 0;
	}
	if (raidPtr->reconControl->numRUsTotal == 0) {
		*data = 0;
		return 0;
	}
	*data = (raidPtr->reconControl->numRUsComplete * 100
	    / raidPtr->reconControl->numRUsTotal);
	return 0;
}
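
/*
 * Userland sketch (essentially what raidctl(8) -S polls; fd is an
 * open raid device):
 *
 *	int pct;
 *	if (ioctl(fd, RAIDFRAME_CHECK_RECON_STATUS, &pct) == 0)
 *		printf("reconstruction: %d%% complete\n", pct);
 */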

static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int unit = raidunit(dev);
	int part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg;
	RF_Raid_t *raidPtr;
	RF_AccTotals_t *totals;
	RF_SingleComponent_t component;
	RF_DeviceConfig_t *d_cfg, *ucfgp;
	int retcode = 0;
	int column;
	RF_ComponentLabel_t *clabel;
	RF_SingleComponent_t *sparePtr, *componentPtr;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
	    (int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	if (rf_must_be_initialized(rs, cmd))
		return ENXIO;

	switch (cmd) {
		/* configure the system */
	case RAIDFRAME_CONFIGURE:
		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
			return retcode;
		return rf_construct(rs, k_cfg);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((retcode = raidlock(rs)) != 0)
			return retcode;

		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return retcode;
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if RF_DISABLED
	case RAIDFRAME_SET_COMPONENT_LABEL:
		return rf_set_component_label(raidPtr, data);
#endif

	case RAIDFRAME_INIT_LABELS:
		return rf_init_component_label(raidPtr, data);

	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return 0;
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
		    rf_RewriteParityThread, raidPtr, "raid_parity");

	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
		return rf_add_hot_spare(raidPtr, &component);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		return retcode;

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_delete_component(raidPtr, &component);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_incorporate_hot_spare(raidPtr, &component);

	case RAIDFRAME_REBUILD_IN_PLACE:
		return rf_rebuild_in_place(raidPtr, data);

	case RAIDFRAME_GET_INFO:
		ucfgp = *(RF_DeviceConfig_t **)data;
		d_cfg = RF_Malloc(sizeof(*d_cfg));
		if (d_cfg == NULL)
			return ENOMEM;
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
		return retcode;

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return 0;

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map, data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return 0;

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return 0;

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return 0;

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return 0;

	case RAIDFRAME_FAIL_DISK:
		return rf_fail_disk(raidPtr, data);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return EINVAL;
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->copyback_thread,
		    rf_CopybackThread, raidPtr, "raid_copyback");

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		return rf_check_recon_status(raidPtr, data);

	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
			    raidPtr->parity_rewrite_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table -- XXX */
#if RF_DISABLED
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return 0;

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		waitreq = RF_Malloc(sizeof(*waitreq));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return 0;

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		waitreq = RF_Malloc(sizeof(*waitreq));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return retcode;
#endif
	default:
		/*
		 * Don't bother trying to load compat modules
		 * if it is not our ioctl. This is more efficient
		 * and makes rump tests not depend on compat code
		 */
		if (IOCGROUP(cmd) != 'r')
			break;
#ifdef _LP64
		if ((l->l_proc->p_flag & PK_32) != 0) {
			module_autoload("compat_netbsd32_raid",
			    MODULE_CLASS_EXEC);
			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
			    (rs, cmd, data), enosys(), retcode);
			if (retcode != EPASSTHROUGH)
				return retcode;
		}
#endif
		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;

		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}


/* raidinit -- complete the rest of the initialization for the
   RAIDframe device. */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe. */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}
1829
1830 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1831 /* wake up the daemon & tell it to get us a spare table
1832 * XXX
1833 * the entries in the queues should be tagged with the raidPtr
1834 * so that in the extremely rare case that two recons happen at once,
1835 * we know for which device were requesting a spare table
1836 * XXX
1837 *
1838 * XXX This code is not currently used. GO
1839 */
1840 int
1841 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1842 {
1843 int retcode;
1844
1845 rf_lock_mutex2(rf_sparet_wait_mutex);
1846 req->next = rf_sparet_wait_queue;
1847 rf_sparet_wait_queue = req;
1848 rf_broadcast_cond2(rf_sparet_wait_cv);
1849
1850 /* mpsleep unlocks the mutex */
1851 while (!rf_sparet_resp_queue) {
1852 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
1853 }
1854 req = rf_sparet_resp_queue;
1855 rf_sparet_resp_queue = req->next;
1856 rf_unlock_mutex2(rf_sparet_wait_mutex);
1857
1858 retcode = req->fcol;
1859 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1860 * alloc'd */
1861 return (retcode);
1862 }
1863 #endif
1864
1865 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1866 * bp & passes it down.
1867 * any calls originating in the kernel must use non-blocking I/O
1868 * do some extra sanity checking to return "appropriate" error values for
1869 * certain conditions (to make some standard utilities work)
1870 *
1871 * Formerly known as: rf_DoAccessKernel
1872 */
1873 void
1874 raidstart(RF_Raid_t *raidPtr)
1875 {
1876 struct raid_softc *rs;
1877 struct dk_softc *dksc;
1878
1879 rs = raidPtr->softc;
1880 dksc = &rs->sc_dksc;
1881 /* quick check to see if anything has died recently */
1882 rf_lock_mutex2(raidPtr->mutex);
1883 if (raidPtr->numNewFailures > 0) {
1884 rf_unlock_mutex2(raidPtr->mutex);
1885 rf_update_component_labels(raidPtr,
1886 RF_NORMAL_COMPONENT_UPDATE);
1887 rf_lock_mutex2(raidPtr->mutex);
1888 raidPtr->numNewFailures--;
1889 }
1890 rf_unlock_mutex2(raidPtr->mutex);
1891
1892 if ((rs->sc_flags & RAIDF_INITED) == 0) {
1893 printf("raid%d: raidstart not ready\n", raidPtr->raidid);
1894 return;
1895 }
1896
1897 dk_start(dksc, NULL);
1898 }
1899
1900 static int
1901 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
1902 {
1903 RF_SectorCount_t num_blocks, pb, sum;
1904 RF_RaidAddr_t raid_addr;
1905 daddr_t blocknum;
1906 int do_async;
1907 int rc;
1908
1909 rf_lock_mutex2(raidPtr->mutex);
1910 if (raidPtr->openings == 0) {
1911 rf_unlock_mutex2(raidPtr->mutex);
1912 return EAGAIN;
1913 }
1914 rf_unlock_mutex2(raidPtr->mutex);
1915
1916 blocknum = bp->b_rawblkno;
1917
1918 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1919 (int) blocknum));
1920
1921 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1922 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1923
1924 /* *THIS* is where we adjust what block we're going to...
1925 * but DO NOT TOUCH bp->b_blkno!!! */
1926 raid_addr = blocknum;
1927
1928 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1929 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1930 sum = raid_addr + num_blocks + pb;
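	/* XXX the "1 ||" below forces the debug branch on; db1_printf()
	 * is a debug-only macro, so this costs nothing in normal kernels. */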
1931 if (1 || rf_debugKernelAccess) {
1932 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1933 (int) raid_addr, (int) sum, (int) num_blocks,
1934 (int) pb, (int) bp->b_resid));
1935 }
1936 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1937 || (sum < num_blocks) || (sum < pb)) {
1938 rc = ENOSPC;
1939 goto done;
1940 }
1941 /*
1942 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1943 */
1944
1945 if (bp->b_bcount & raidPtr->sectorMask) {
1946 rc = ENOSPC;
1947 goto done;
1948 }
1949 db1_printf(("Calling DoAccess..\n"));
1950
1951
1952 rf_lock_mutex2(raidPtr->mutex);
1953 raidPtr->openings--;
1954 rf_unlock_mutex2(raidPtr->mutex);
1955
1956 /*
1957 * Everything is async.
1958 */
1959 do_async = 1;
1960
1961 /* don't ever condition on bp->b_flags & B_WRITE.
1962 * always condition on B_READ instead */
1963
1964 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
1965 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
1966 do_async, raid_addr, num_blocks,
1967 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
1968
1969 done:
1970 return rc;
1971 }
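/*
 * Worked example of the bounds check in raiddoaccess() above, assuming
 * 512-byte sectors (logBytesPerSector == 9, sectorMask == 0x1ff): a
 * 64 KB write (b_bcount == 65536) at raid_addr 1000 yields
 * num_blocks == 128 and pb == 0, so sum == 1128 and the request is
 * accepted only if 1128 <= totalSectors.  The "sum < raid_addr" style
 * comparisons catch arithmetic wraparound near the top of the address
 * space.
 */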
1972
1973 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
1974
1975 int
1976 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
1977 {
1978 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
1979 struct buf *bp;
1980
1981 req->queue = queue;
1982 bp = req->bp;
1983
1984 switch (req->type) {
1985 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
1986 /* XXX need to do something extra here.. */
1987 /* I'm leaving this in, as I've never actually seen it used,
1988 * and I'd like folks to report it... GO */
1989 		printf("WAKEUP CALLED\n");
1990 queue->numOutstanding++;
1991
1992 bp->b_flags = 0;
1993 bp->b_private = req;
1994
1995 KernelWakeupFunc(bp);
1996 break;
1997
1998 case RF_IO_TYPE_READ:
1999 case RF_IO_TYPE_WRITE:
2000 #if RF_ACC_TRACE > 0
2001 if (req->tracerec) {
2002 RF_ETIMER_START(req->tracerec->timer);
2003 }
2004 #endif
2005 InitBP(bp, queue->rf_cinfo->ci_vp,
2006 op, queue->rf_cinfo->ci_dev,
2007 req->sectorOffset, req->numSector,
2008 req->buf, KernelWakeupFunc, (void *) req,
2009 queue->raidPtr->logBytesPerSector);
2010
2011 if (rf_debugKernelAccess) {
2012 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2013 (long) bp->b_blkno));
2014 }
2015 queue->numOutstanding++;
2016 queue->last_deq_sector = req->sectorOffset;
2017 /* acc wouldn't have been let in if there were any pending
2018 * reqs at any other priority */
2019 queue->curPriority = req->priority;
2020
2021 db1_printf(("Going for %c to unit %d col %d\n",
2022 req->type, queue->raidPtr->raidid,
2023 queue->col));
2024 db1_printf(("sector %d count %d (%d bytes) %d\n",
2025 (int) req->sectorOffset, (int) req->numSector,
2026 (int) (req->numSector <<
2027 queue->raidPtr->logBytesPerSector),
2028 (int) queue->raidPtr->logBytesPerSector));
2029
2030 /*
2031 * XXX: drop lock here since this can block at
2032 * least with backing SCSI devices. Retake it
2033 * to minimize fuss with calling interfaces.
2034 */
2035
2036 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2037 bdev_strategy(bp);
2038 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2039 break;
2040
2041 default:
2042 panic("bad req->type in rf_DispatchKernelIO");
2043 }
2044 db1_printf(("Exiting from DispatchKernelIO\n"));
2045
2046 return (0);
2047 }
2048 /* this is the callback function associated with an I/O invoked from
2049 kernel code.
2050 */
2051 static void
2052 KernelWakeupFunc(struct buf *bp)
2053 {
2054 RF_DiskQueueData_t *req = NULL;
2055 RF_DiskQueue_t *queue;
2056
2057 db1_printf(("recovering the request queue:\n"));
2058
2059 req = bp->b_private;
2060
2061 queue = (RF_DiskQueue_t *) req->queue;
2062
2063 rf_lock_mutex2(queue->raidPtr->iodone_lock);
2064
2065 #if RF_ACC_TRACE > 0
2066 if (req->tracerec) {
2067 RF_ETIMER_STOP(req->tracerec->timer);
2068 RF_ETIMER_EVAL(req->tracerec->timer);
2069 rf_lock_mutex2(rf_tracing_mutex);
2070 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2071 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2072 req->tracerec->num_phys_ios++;
2073 rf_unlock_mutex2(rf_tracing_mutex);
2074 }
2075 #endif
2076
2077 /* XXX Ok, let's get aggressive... If b_error is set, let's go
2078 * ballistic, and mark the component as hosed... */
2079
2080 if (bp->b_error != 0) {
2081 /* Mark the disk as dead */
2082 /* but only mark it once... */
2083 /* and only if it wouldn't leave this RAID set
2084 completely broken */
2085 if (((queue->raidPtr->Disks[queue->col].status ==
2086 rf_ds_optimal) ||
2087 (queue->raidPtr->Disks[queue->col].status ==
2088 rf_ds_used_spare)) &&
2089 (queue->raidPtr->numFailures <
2090 queue->raidPtr->Layout.map->faultsTolerated)) {
2091 printf("raid%d: IO Error (%d). Marking %s as failed.\n",
2092 queue->raidPtr->raidid,
2093 bp->b_error,
2094 queue->raidPtr->Disks[queue->col].devname);
2095 queue->raidPtr->Disks[queue->col].status =
2096 rf_ds_failed;
2097 queue->raidPtr->status = rf_rs_degraded;
2098 queue->raidPtr->numFailures++;
2099 queue->raidPtr->numNewFailures++;
2100 } else { /* Disk is already dead... */
2101 /* printf("Disk already marked as dead!\n"); */
2102 }
2103
2104 }
2105
2106 /* Fill in the error value */
2107 req->error = bp->b_error;
2108
2109 /* Drop this one on the "finished" queue... */
2110 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2111
2112 /* Let the raidio thread know there is work to be done. */
2113 rf_signal_cond2(queue->raidPtr->iodone_cv);
2114
2115 rf_unlock_mutex2(queue->raidPtr->iodone_lock);
2116 }
2117
2118
2119 /*
2120 * initialize a buf structure for doing an I/O in the kernel.
2121 */
2122 static void
2123 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2124 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2125 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
2126 {
2127 bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
2128 bp->b_oflags = 0;
2129 bp->b_cflags = 0;
2130 bp->b_bcount = numSect << logBytesPerSector;
2131 bp->b_bufsize = bp->b_bcount;
2132 bp->b_error = 0;
2133 bp->b_dev = dev;
2134 bp->b_data = bf;
2135 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2136 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2137 if (bp->b_bcount == 0) {
2138 panic("bp->b_bcount is zero in InitBP!!");
2139 }
2140 bp->b_iodone = cbFunc;
2141 bp->b_private = cbArg;
2142 }
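/*
 * Note on the b_blkno computation in InitBP(): startSect is in units of
 * the RAID set's sectors while b_blkno is in DEV_BSIZE (512-byte)
 * units, so "startSect << logBytesPerSector >> DEV_BSHIFT" converts via
 * a byte offset.  For example, with 2048-byte sectors
 * (logBytesPerSector == 11) startSect 100 becomes b_blkno 400; with
 * 512-byte sectors the expression is an identity.
 */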
2143
2144 /*
2145 * Wait interruptibly for an exclusive lock.
2146 *
2147 * XXX
2148 * Several drivers do this; it should be abstracted and made MP-safe.
2149 * (Hmm... where have we seen this warning before :-> GO )
2150 */
2151 static int
2152 raidlock(struct raid_softc *rs)
2153 {
2154 int error;
2155
2156 error = 0;
2157 mutex_enter(&rs->sc_mutex);
2158 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2159 rs->sc_flags |= RAIDF_WANTED;
2160 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2161 if (error != 0)
2162 goto done;
2163 }
2164 rs->sc_flags |= RAIDF_LOCKED;
2165 done:
2166 mutex_exit(&rs->sc_mutex);
2167 return (error);
2168 }
2169 /*
2170 * Unlock and wake up any waiters.
2171 */
2172 static void
2173 raidunlock(struct raid_softc *rs)
2174 {
2175
2176 mutex_enter(&rs->sc_mutex);
2177 rs->sc_flags &= ~RAIDF_LOCKED;
2178 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2179 rs->sc_flags &= ~RAIDF_WANTED;
2180 cv_broadcast(&rs->sc_cv);
2181 }
2182 mutex_exit(&rs->sc_mutex);
2183 }
2184
2185
2186 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2187 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2188 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
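/*
 * On-disk layout implied by the constants above, as computed by the
 * helpers below (offsets are bytes from the start of the component;
 * each area is rounded up to a full sector when the sector size
 * exceeds its nominal size):
 *
 *	0 .. 16383	reserved, within the region set aside by
 *			rf_protectedSectors
 *	16384		component label,
 *			max(secsize, RF_COMPONENT_INFO_SIZE) bytes
 *	label end	parity map,
 *			max(secsize, RF_PARITY_MAP_SIZE) bytes
 */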
2189
2190 static daddr_t
2191 rf_component_info_offset(void)
2192 {
2193
2194 return RF_COMPONENT_INFO_OFFSET;
2195 }
2196
2197 static daddr_t
2198 rf_component_info_size(unsigned secsize)
2199 {
2200 daddr_t info_size;
2201
2202 KASSERT(secsize);
2203 if (secsize > RF_COMPONENT_INFO_SIZE)
2204 info_size = secsize;
2205 else
2206 info_size = RF_COMPONENT_INFO_SIZE;
2207
2208 return info_size;
2209 }
2210
2211 static daddr_t
2212 rf_parity_map_offset(RF_Raid_t *raidPtr)
2213 {
2214 daddr_t map_offset;
2215
2216 KASSERT(raidPtr->bytesPerSector);
2217 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2218 map_offset = raidPtr->bytesPerSector;
2219 else
2220 map_offset = RF_COMPONENT_INFO_SIZE;
2221 map_offset += rf_component_info_offset();
2222
2223 return map_offset;
2224 }
2225
2226 static daddr_t
2227 rf_parity_map_size(RF_Raid_t *raidPtr)
2228 {
2229 daddr_t map_size;
2230
2231 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2232 map_size = raidPtr->bytesPerSector;
2233 else
2234 map_size = RF_PARITY_MAP_SIZE;
2235
2236 return map_size;
2237 }
2238
2239 int
2240 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2241 {
2242 RF_ComponentLabel_t *clabel;
2243
2244 clabel = raidget_component_label(raidPtr, col);
2245 clabel->clean = RF_RAID_CLEAN;
2246 raidflush_component_label(raidPtr, col);
2247 return(0);
2248 }
2249
2250
2251 int
2252 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2253 {
2254 RF_ComponentLabel_t *clabel;
2255
2256 clabel = raidget_component_label(raidPtr, col);
2257 clabel->clean = RF_RAID_DIRTY;
2258 raidflush_component_label(raidPtr, col);
2259 return(0);
2260 }
2261
2262 int
2263 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2264 {
2265 KASSERT(raidPtr->bytesPerSector);
2266 return raidread_component_label(raidPtr->bytesPerSector,
2267 raidPtr->Disks[col].dev,
2268 raidPtr->raid_cinfo[col].ci_vp,
2269 &raidPtr->raid_cinfo[col].ci_label);
2270 }
2271
2272 RF_ComponentLabel_t *
2273 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2274 {
2275 return &raidPtr->raid_cinfo[col].ci_label;
2276 }
2277
2278 int
2279 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2280 {
2281 RF_ComponentLabel_t *label;
2282
2283 label = &raidPtr->raid_cinfo[col].ci_label;
2284 label->mod_counter = raidPtr->mod_counter;
2285 #ifndef RF_NO_PARITY_MAP
2286 label->parity_map_modcount = label->mod_counter;
2287 #endif
2288 return raidwrite_component_label(raidPtr->bytesPerSector,
2289 raidPtr->Disks[col].dev,
2290 raidPtr->raid_cinfo[col].ci_vp, label);
2291 }
2292
2293
2294 static int
2295 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2296 RF_ComponentLabel_t *clabel)
2297 {
2298 return raidread_component_area(dev, b_vp, clabel,
2299 sizeof(RF_ComponentLabel_t),
2300 rf_component_info_offset(),
2301 rf_component_info_size(secsize));
2302 }
2303
2304 /* ARGSUSED */
2305 static int
2306 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2307 size_t msize, daddr_t offset, daddr_t dsize)
2308 {
2309 struct buf *bp;
2310 int error;
2311
2312 /* XXX should probably ensure that we don't try to do this if
2313 someone has changed rf_protected_sectors. */
2314
2315 if (b_vp == NULL) {
2316 /* For whatever reason, this component is not valid.
2317 Don't try to read a component label from it. */
2318 return(EINVAL);
2319 }
2320
2321 /* get a block of the appropriate size... */
2322 bp = geteblk((int)dsize);
2323 bp->b_dev = dev;
2324
2325 /* get our ducks in a row for the read */
2326 bp->b_blkno = offset / DEV_BSIZE;
2327 bp->b_bcount = dsize;
2328 bp->b_flags |= B_READ;
2329 bp->b_resid = dsize;
2330
2331 bdev_strategy(bp);
2332 error = biowait(bp);
2333
2334 if (!error) {
2335 memcpy(data, bp->b_data, msize);
2336 }
2337
2338 brelse(bp, 0);
2339 return(error);
2340 }
2341
2342
2343 static int
2344 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2345 RF_ComponentLabel_t *clabel)
2346 {
2347 return raidwrite_component_area(dev, b_vp, clabel,
2348 sizeof(RF_ComponentLabel_t),
2349 rf_component_info_offset(),
2350 rf_component_info_size(secsize), 0);
2351 }
2352
2353 /* ARGSUSED */
2354 static int
2355 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2356 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2357 {
2358 struct buf *bp;
2359 int error;
2360
2361 /* get a block of the appropriate size... */
2362 bp = geteblk((int)dsize);
2363 bp->b_dev = dev;
2364
2365 /* get our ducks in a row for the write */
2366 bp->b_blkno = offset / DEV_BSIZE;
2367 bp->b_bcount = dsize;
2368 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2369 bp->b_resid = dsize;
2370
2371 memset(bp->b_data, 0, dsize);
2372 memcpy(bp->b_data, data, msize);
2373
2374 bdev_strategy(bp);
2375 if (asyncp)
2376 return 0;
2377 error = biowait(bp);
2378 brelse(bp, 0);
2379 if (error) {
2380 #if 1
2381 printf("Failed to write RAID component info!\n");
2382 #endif
2383 }
2384
2385 return(error);
2386 }
2387
2388 void
2389 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2390 {
2391 int c;
2392
2393 for (c = 0; c < raidPtr->numCol; c++) {
2394 /* Skip dead disks. */
2395 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2396 continue;
2397 /* XXXjld: what if an error occurs here? */
2398 raidwrite_component_area(raidPtr->Disks[c].dev,
2399 raidPtr->raid_cinfo[c].ci_vp, map,
2400 RF_PARITYMAP_NBYTE,
2401 rf_parity_map_offset(raidPtr),
2402 rf_parity_map_size(raidPtr), 0);
2403 }
2404 }
2405
2406 void
2407 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2408 {
2409 struct rf_paritymap_ondisk tmp;
2410 int c,first;
2411
2412 first=1;
2413 for (c = 0; c < raidPtr->numCol; c++) {
2414 /* Skip dead disks. */
2415 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2416 continue;
2417 raidread_component_area(raidPtr->Disks[c].dev,
2418 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2419 RF_PARITYMAP_NBYTE,
2420 rf_parity_map_offset(raidPtr),
2421 rf_parity_map_size(raidPtr));
2422 if (first) {
2423 memcpy(map, &tmp, sizeof(*map));
2424 first = 0;
2425 } else {
2426 rf_paritymap_merge(map, &tmp);
2427 }
2428 }
2429 }
2430
2431 void
2432 rf_markalldirty(RF_Raid_t *raidPtr)
2433 {
2434 RF_ComponentLabel_t *clabel;
2435 int sparecol;
2436 int c;
2437 int j;
2438 int scol = -1;
2439
2440 raidPtr->mod_counter++;
2441 for (c = 0; c < raidPtr->numCol; c++) {
2442 /* we don't want to touch (at all) a disk that has
2443 failed */
2444 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2445 clabel = raidget_component_label(raidPtr, c);
2446 if (clabel->status == rf_ds_spared) {
2447 /* XXX do something special...
2448 but whatever you do, don't
2449 try to access it!! */
2450 } else {
2451 raidmarkdirty(raidPtr, c);
2452 }
2453 }
2454 }
2455
2456 for( c = 0; c < raidPtr->numSpare ; c++) {
2457 sparecol = raidPtr->numCol + c;
2458 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2459 			/*
2460 			 * We claim this disk is "optimal" if it's
2461 			 * rf_ds_used_spare, as that means it should be
2462 			 * directly substitutable for the disk it
2463 			 * replaced.
2464 			 *
2465 			 * We note that too...
2466 			 */
2467
2468 for(j=0;j<raidPtr->numCol;j++) {
2469 if (raidPtr->Disks[j].spareCol == sparecol) {
2470 scol = j;
2471 break;
2472 }
2473 }
2474
2475 clabel = raidget_component_label(raidPtr, sparecol);
2476 /* make sure status is noted */
2477
2478 raid_init_component_label(raidPtr, clabel);
2479
2480 clabel->row = 0;
2481 clabel->column = scol;
2482 /* Note: we *don't* change status from rf_ds_used_spare
2483 to rf_ds_optimal */
2484 /* clabel.status = rf_ds_optimal; */
2485
2486 raidmarkdirty(raidPtr, sparecol);
2487 }
2488 }
2489 }
2490
2491
2492 void
2493 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2494 {
2495 RF_ComponentLabel_t *clabel;
2496 int sparecol;
2497 int c;
2498 int j;
2499 int scol;
2500 struct raid_softc *rs = raidPtr->softc;
2501
2502 scol = -1;
2503
2504 /* XXX should do extra checks to make sure things really are clean,
2505 rather than blindly setting the clean bit... */
2506
2507 raidPtr->mod_counter++;
2508
2509 for (c = 0; c < raidPtr->numCol; c++) {
2510 if (raidPtr->Disks[c].status == rf_ds_optimal) {
2511 clabel = raidget_component_label(raidPtr, c);
2512 /* make sure status is noted */
2513 clabel->status = rf_ds_optimal;
2514
2515 /* note what unit we are configured as */
2516 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2517 clabel->last_unit = raidPtr->raidid;
2518
2519 raidflush_component_label(raidPtr, c);
2520 if (final == RF_FINAL_COMPONENT_UPDATE) {
2521 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2522 raidmarkclean(raidPtr, c);
2523 }
2524 }
2525 }
2526 /* else we don't touch it.. */
2527 }
2528
2529 for( c = 0; c < raidPtr->numSpare ; c++) {
2530 sparecol = raidPtr->numCol + c;
2531 /* Need to ensure that the reconstruct actually completed! */
2532 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2533 			/*
2534 			 * We claim this disk is "optimal" if it's
2535 			 * rf_ds_used_spare, as that means it should be
2536 			 * directly substitutable for the disk it
2537 			 * replaced.
2538 			 *
2539 			 * We note that too...
2540 			 */
2541
2542 for(j=0;j<raidPtr->numCol;j++) {
2543 if (raidPtr->Disks[j].spareCol == sparecol) {
2544 scol = j;
2545 break;
2546 }
2547 }
2548
2549 /* XXX shouldn't *really* need this... */
2550 clabel = raidget_component_label(raidPtr, sparecol);
2551 /* make sure status is noted */
2552
2553 raid_init_component_label(raidPtr, clabel);
2554
2555 clabel->column = scol;
2556 clabel->status = rf_ds_optimal;
2557 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2558 clabel->last_unit = raidPtr->raidid;
2559
2560 raidflush_component_label(raidPtr, sparecol);
2561 if (final == RF_FINAL_COMPONENT_UPDATE) {
2562 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2563 raidmarkclean(raidPtr, sparecol);
2564 }
2565 }
2566 }
2567 }
2568 }
2569
2570 void
2571 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2572 {
2573
2574 if (vp != NULL) {
2575 if (auto_configured == 1) {
2576 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2577 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2578 vput(vp);
2579
2580 } else {
2581 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2582 }
2583 }
2584 }
2585
2586
2587 void
2588 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2589 {
2590 int r,c;
2591 struct vnode *vp;
2592 int acd;
2593
2594
2595 /* We take this opportunity to close the vnodes like we should.. */
2596
2597 for (c = 0; c < raidPtr->numCol; c++) {
2598 vp = raidPtr->raid_cinfo[c].ci_vp;
2599 acd = raidPtr->Disks[c].auto_configured;
2600 rf_close_component(raidPtr, vp, acd);
2601 raidPtr->raid_cinfo[c].ci_vp = NULL;
2602 raidPtr->Disks[c].auto_configured = 0;
2603 }
2604
2605 for (r = 0; r < raidPtr->numSpare; r++) {
2606 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2607 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2608 rf_close_component(raidPtr, vp, acd);
2609 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2610 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2611 }
2612 }
2613
2614
2615 void
2616 rf_ReconThread(struct rf_recon_req_internal *req)
2617 {
2618 int s;
2619 RF_Raid_t *raidPtr;
2620
2621 s = splbio();
2622 raidPtr = (RF_Raid_t *) req->raidPtr;
2623 raidPtr->recon_in_progress = 1;
2624
2625 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2626 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2627
2628 RF_Free(req, sizeof(*req));
2629
2630 raidPtr->recon_in_progress = 0;
2631 splx(s);
2632
2633 /* That's all... */
2634 kthread_exit(0); /* does not return */
2635 }
2636
2637 void
2638 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2639 {
2640 int retcode;
2641 int s;
2642
2643 raidPtr->parity_rewrite_stripes_done = 0;
2644 raidPtr->parity_rewrite_in_progress = 1;
2645 s = splbio();
2646 retcode = rf_RewriteParity(raidPtr);
2647 splx(s);
2648 if (retcode) {
2649 printf("raid%d: Error re-writing parity (%d)!\n",
2650 raidPtr->raidid, retcode);
2651 } else {
2652 /* set the clean bit! If we shutdown correctly,
2653 the clean bit on each component label will get
2654 set */
2655 raidPtr->parity_good = RF_RAID_CLEAN;
2656 }
2657 raidPtr->parity_rewrite_in_progress = 0;
2658
2659 /* Anyone waiting for us to stop? If so, inform them... */
2660 if (raidPtr->waitShutdown) {
2661 rf_lock_mutex2(raidPtr->rad_lock);
2662 cv_broadcast(&raidPtr->parity_rewrite_cv);
2663 rf_unlock_mutex2(raidPtr->rad_lock);
2664 }
2665
2666 /* That's all... */
2667 kthread_exit(0); /* does not return */
2668 }
2669
2670
2671 void
2672 rf_CopybackThread(RF_Raid_t *raidPtr)
2673 {
2674 int s;
2675
2676 raidPtr->copyback_in_progress = 1;
2677 s = splbio();
2678 rf_CopybackReconstructedData(raidPtr);
2679 splx(s);
2680 raidPtr->copyback_in_progress = 0;
2681
2682 /* That's all... */
2683 kthread_exit(0); /* does not return */
2684 }
2685
2686
2687 void
2688 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
2689 {
2690 int s;
2691 RF_Raid_t *raidPtr;
2692
2693 s = splbio();
2694 raidPtr = req->raidPtr;
2695 raidPtr->recon_in_progress = 1;
2696 rf_ReconstructInPlace(raidPtr, req->col);
2697 RF_Free(req, sizeof(*req));
2698 raidPtr->recon_in_progress = 0;
2699 splx(s);
2700
2701 /* That's all... */
2702 kthread_exit(0); /* does not return */
2703 }
2704
2705 static RF_AutoConfig_t *
2706 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2707 const char *cname, RF_SectorCount_t size, uint64_t numsecs,
2708 unsigned secsize)
2709 {
2710 int good_one = 0;
2711 RF_ComponentLabel_t *clabel;
2712 RF_AutoConfig_t *ac;
2713
2714 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);
2715
2716 if (!raidread_component_label(secsize, dev, vp, clabel)) {
2717 /* Got the label. Does it look reasonable? */
2718 if (rf_reasonable_label(clabel, numsecs) &&
2719 (rf_component_label_partitionsize(clabel) <= size)) {
2720 #ifdef DEBUG
2721 printf("Component on: %s: %llu\n",
2722 cname, (unsigned long long)size);
2723 rf_print_component_label(clabel);
2724 #endif
2725 /* if it's reasonable, add it, else ignore it. */
2726 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2727 M_WAITOK);
2728 strlcpy(ac->devname, cname, sizeof(ac->devname));
2729 ac->dev = dev;
2730 ac->vp = vp;
2731 ac->clabel = clabel;
2732 ac->next = ac_list;
2733 ac_list = ac;
2734 good_one = 1;
2735 }
2736 }
2737 if (!good_one) {
2738 /* cleanup */
2739 free(clabel, M_RAIDFRAME);
2740 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2741 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2742 vput(vp);
2743 }
2744 return ac_list;
2745 }
2746
2747 RF_AutoConfig_t *
2748 rf_find_raid_components(void)
2749 {
2750 struct vnode *vp;
2751 struct disklabel label;
2752 device_t dv;
2753 deviter_t di;
2754 dev_t dev;
2755 int bmajor, bminor, wedge, rf_part_found;
2756 int error;
2757 int i;
2758 RF_AutoConfig_t *ac_list;
2759 uint64_t numsecs;
2760 unsigned secsize;
2761 int dowedges;
2762
2763 /* initialize the AutoConfig list */
2764 ac_list = NULL;
2765
2766 /*
2767 * we begin by trolling through *all* the devices on the system *twice*
2768 * first we scan for wedges, second for other devices. This avoids
2769 * using a raw partition instead of a wedge that covers the whole disk
2770 */
2771
2772 for (dowedges=1; dowedges>=0; --dowedges) {
2773 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
2774 dv = deviter_next(&di)) {
2775
2776 /* we are only interested in disks... */
2777 if (device_class(dv) != DV_DISK)
2778 continue;
2779
2780 /* we don't care about floppies... */
2781 if (device_is_a(dv, "fd")) {
2782 continue;
2783 }
2784
2785 /* we don't care about CD's... */
2786 if (device_is_a(dv, "cd")) {
2787 continue;
2788 }
2789
2790 /* we don't care about md's... */
2791 if (device_is_a(dv, "md")) {
2792 continue;
2793 }
2794
2795 /* hdfd is the Atari/Hades floppy driver */
2796 if (device_is_a(dv, "hdfd")) {
2797 continue;
2798 }
2799
2800 /* fdisa is the Atari/Milan floppy driver */
2801 if (device_is_a(dv, "fdisa")) {
2802 continue;
2803 }
2804
2805 			/* are we in the wedges pass? */
2806 wedge = device_is_a(dv, "dk");
2807 if (wedge != dowedges) {
2808 continue;
2809 }
2810
2811 /* need to find the device_name_to_block_device_major stuff */
2812 bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
2813
2814 			rf_part_found = 0; /* No raid partition as yet */
2815
2816 /* get a vnode for the raw partition of this disk */
2817 bminor = minor(device_unit(dv));
2818 dev = wedge ? makedev(bmajor, bminor) :
2819 MAKEDISKDEV(bmajor, bminor, RAW_PART);
2820 if (bdevvp(dev, &vp))
2821 panic("RAID can't alloc vnode");
2822
2823 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2824 error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
2825
2826 if (error) {
2827 /* "Who cares." Continue looking
2828 for something that exists*/
2829 vput(vp);
2830 continue;
2831 }
2832
2833 error = getdisksize(vp, &numsecs, &secsize);
2834 if (error) {
2835 /*
2836 * Pseudo devices like vnd and cgd can be
2837 * opened but may still need some configuration.
2838 * Ignore these quietly.
2839 */
2840 if (error != ENXIO)
2841 printf("RAIDframe: can't get disk size"
2842 " for dev %s (%d)\n",
2843 device_xname(dv), error);
2844 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2845 vput(vp);
2846 continue;
2847 }
2848 if (wedge) {
2849 struct dkwedge_info dkw;
2850 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
2851 NOCRED);
2852 if (error) {
2853 printf("RAIDframe: can't get wedge info for "
2854 "dev %s (%d)\n", device_xname(dv), error);
2855 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2856 vput(vp);
2857 continue;
2858 }
2859
2860 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
2861 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2862 vput(vp);
2863 continue;
2864 }
2865
2866 VOP_UNLOCK(vp);
2867 ac_list = rf_get_component(ac_list, dev, vp,
2868 device_xname(dv), dkw.dkw_size, numsecs, secsize);
2869 				rf_part_found = 1; /* There is a raid component on this disk */
2870 continue;
2871 }
2872
2873 /* Ok, the disk exists. Go get the disklabel. */
2874 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
2875 if (error) {
2876 /*
2877 * XXX can't happen - open() would
2878 * have errored out (or faked up one)
2879 */
2880 if (error != ENOTTY)
2881 printf("RAIDframe: can't get label for dev "
2882 "%s (%d)\n", device_xname(dv), error);
2883 }
2884
2885 /* don't need this any more. We'll allocate it again
2886 a little later if we really do... */
2887 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2888 vput(vp);
2889
2890 if (error)
2891 continue;
2892
2893 			rf_part_found = 0; /* No raid partitions yet */
2894 for (i = 0; i < label.d_npartitions; i++) {
2895 char cname[sizeof(ac_list->devname)];
2896
2897 /* We only support partitions marked as RAID */
2898 if (label.d_partitions[i].p_fstype != FS_RAID)
2899 continue;
2900
2901 dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
2902 if (bdevvp(dev, &vp))
2903 panic("RAID can't alloc vnode");
2904
2905 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2906 error = VOP_OPEN(vp, FREAD, NOCRED);
2907 if (error) {
2908 /* Whatever... */
2909 vput(vp);
2910 continue;
2911 }
2912 VOP_UNLOCK(vp);
2913 snprintf(cname, sizeof(cname), "%s%c",
2914 device_xname(dv), 'a' + i);
2915 ac_list = rf_get_component(ac_list, dev, vp, cname,
2916 label.d_partitions[i].p_size, numsecs, secsize);
2917 				rf_part_found = 1; /* There is at least one raid partition on this disk */
2918 }
2919
2920 			/*
2921 			 * If there is no raid component on this disk, either in a
2922 			 * disklabel or inside a wedge, check the raw partition as
2923 			 * well, as it is possible to configure raid components on
2924 			 * raw disk devices.
2925 			 */
2926
2927 if (!rf_part_found) {
2928 char cname[sizeof(ac_list->devname)];
2929
2930 dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
2931 if (bdevvp(dev, &vp))
2932 panic("RAID can't alloc vnode");
2933
2934 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2935
2936 error = VOP_OPEN(vp, FREAD, NOCRED);
2937 if (error) {
2938 /* Whatever... */
2939 vput(vp);
2940 continue;
2941 }
2942 VOP_UNLOCK(vp);
2943 snprintf(cname, sizeof(cname), "%s%c",
2944 device_xname(dv), 'a' + RAW_PART);
2945 ac_list = rf_get_component(ac_list, dev, vp, cname,
2946 label.d_partitions[RAW_PART].p_size, numsecs, secsize);
2947 }
2948 }
2949 deviter_release(&di);
2950 }
2951 return ac_list;
2952 }
2953
2954
2955 int
2956 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
2957 {
2958
2959 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2960 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2961 ((clabel->clean == RF_RAID_CLEAN) ||
2962 (clabel->clean == RF_RAID_DIRTY)) &&
2963 	    clabel->row >= 0 &&
2964 clabel->column >= 0 &&
2965 clabel->num_rows > 0 &&
2966 clabel->num_columns > 0 &&
2967 clabel->row < clabel->num_rows &&
2968 clabel->column < clabel->num_columns &&
2969 clabel->blockSize > 0 &&
2970 /*
2971 * numBlocksHi may contain garbage, but it is ok since
2972 * the type is unsigned. If it is really garbage,
2973 * rf_fix_old_label_size() will fix it.
2974 */
2975 rf_component_label_numblocks(clabel) > 0) {
2976 /*
2977 * label looks reasonable enough...
2978 * let's make sure it has no old garbage.
2979 */
2980 if (numsecs)
2981 rf_fix_old_label_size(clabel, numsecs);
2982 return(1);
2983 }
2984 return(0);
2985 }
2986
2987
2988 /*
2989 * For reasons yet unknown, some old component labels have garbage in
2990 * the newer numBlocksHi region, and this causes lossage. Since those
2991 * disks will also have numsecs set to less than 32 bits of sectors,
2992 * we can determine when this corruption has occurred, and fix it.
2993 *
2994 * The exact same problem, with the same unknown reason, happens to
2995 * the partitionSizeHi member as well.
2996 */
2997 static void
2998 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
2999 {
3000
3001 if (numsecs < ((uint64_t)1 << 32)) {
3002 if (clabel->numBlocksHi) {
3003 printf("WARNING: total sectors < 32 bits, yet "
3004 "numBlocksHi set\n"
3005 "WARNING: resetting numBlocksHi to zero.\n");
3006 clabel->numBlocksHi = 0;
3007 }
3008
3009 if (clabel->partitionSizeHi) {
3010 printf("WARNING: total sectors < 32 bits, yet "
3011 "partitionSizeHi set\n"
3012 "WARNING: resetting partitionSizeHi to zero.\n");
3013 clabel->partitionSizeHi = 0;
3014 }
3015 }
3016 }
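/*
 * Example: a 1 TB component is roughly 1.95e9 512-byte sectors, well
 * under 2^32, so any nonzero numBlocksHi or partitionSizeHi found in
 * its label can only be stale garbage and is safely cleared above.
 */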
3017
3018
3019 #ifdef DEBUG
3020 void
3021 rf_print_component_label(RF_ComponentLabel_t *clabel)
3022 {
3023 uint64_t numBlocks;
3024 static const char *rp[] = {
3025 "No", "Force", "Soft", "*invalid*"
3026 };
3027
3028
3029 numBlocks = rf_component_label_numblocks(clabel);
3030
3031 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
3032 clabel->row, clabel->column,
3033 clabel->num_rows, clabel->num_columns);
3034 printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
3035 clabel->version, clabel->serial_number,
3036 clabel->mod_counter);
3037 printf(" Clean: %s Status: %d\n",
3038 clabel->clean ? "Yes" : "No", clabel->status);
3039 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
3040 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
3041 printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n",
3042 (char) clabel->parityConfig, clabel->blockSize, numBlocks);
3043 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
3044 printf(" Root partition: %s\n", rp[clabel->root_partition & 3]);
3045 printf(" Last configured as: raid%d\n", clabel->last_unit);
3046 #if 0
3047 printf(" Config order: %d\n", clabel->config_order);
3048 #endif
3049
3050 }
3051 #endif
3052
3053 RF_ConfigSet_t *
3054 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3055 {
3056 RF_AutoConfig_t *ac;
3057 RF_ConfigSet_t *config_sets;
3058 RF_ConfigSet_t *cset;
3059 RF_AutoConfig_t *ac_next;
3060
3061
3062 config_sets = NULL;
3063
3064 /* Go through the AutoConfig list, and figure out which components
3065 belong to what sets. */
3066 ac = ac_list;
3067 while(ac!=NULL) {
3068 /* we're going to putz with ac->next, so save it here
3069 for use at the end of the loop */
3070 ac_next = ac->next;
3071
3072 if (config_sets == NULL) {
3073 /* will need at least this one... */
3074 config_sets = malloc(sizeof(RF_ConfigSet_t),
3075 M_RAIDFRAME, M_WAITOK);
3076 /* this one is easy :) */
3077 config_sets->ac = ac;
3078 config_sets->next = NULL;
3079 config_sets->rootable = 0;
3080 ac->next = NULL;
3081 } else {
3082 /* which set does this component fit into? */
3083 cset = config_sets;
3084 while(cset!=NULL) {
3085 if (rf_does_it_fit(cset, ac)) {
3086 /* looks like it matches... */
3087 ac->next = cset->ac;
3088 cset->ac = ac;
3089 break;
3090 }
3091 cset = cset->next;
3092 }
3093 if (cset==NULL) {
3094 /* didn't find a match above... new set..*/
3095 cset = malloc(sizeof(RF_ConfigSet_t),
3096 M_RAIDFRAME, M_WAITOK);
3097 cset->ac = ac;
3098 ac->next = NULL;
3099 cset->next = config_sets;
3100 cset->rootable = 0;
3101 config_sets = cset;
3102 }
3103 }
3104 ac = ac_next;
3105 }
3106
3107
3108 return(config_sets);
3109 }
3110
3111 static int
3112 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3113 {
3114 RF_ComponentLabel_t *clabel1, *clabel2;
3115
3116 /* If this one matches the *first* one in the set, that's good
3117 enough, since the other members of the set would have been
3118 through here too... */
3119 /* note that we are not checking partitionSize here..
3120
3121 Note that we are also not checking the mod_counters here.
3122 If everything else matches except the mod_counter, that's
3123 good enough for this test. We will deal with the mod_counters
3124 a little later in the autoconfiguration process.
3125
3126 (clabel1->mod_counter == clabel2->mod_counter) &&
3127
3128 The reason we don't check for this is that failed disks
3129 will have lower modification counts. If those disks are
3130 not added to the set they used to belong to, then they will
3131 form their own set, which may result in 2 different sets,
3132 for example, competing to be configured at raid0, and
3133 perhaps competing to be the root filesystem set. If the
3134 wrong ones get configured, or both attempt to become /,
3135 	   weird behaviour and/or serious lossage will occur.  Thus we
3136 need to bring them into the fold here, and kick them out at
3137 a later point.
3138
3139 */
3140
3141 clabel1 = cset->ac->clabel;
3142 clabel2 = ac->clabel;
3143 if ((clabel1->version == clabel2->version) &&
3144 (clabel1->serial_number == clabel2->serial_number) &&
3145 (clabel1->num_rows == clabel2->num_rows) &&
3146 (clabel1->num_columns == clabel2->num_columns) &&
3147 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3148 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3149 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3150 (clabel1->parityConfig == clabel2->parityConfig) &&
3151 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3152 (clabel1->blockSize == clabel2->blockSize) &&
3153 rf_component_label_numblocks(clabel1) ==
3154 rf_component_label_numblocks(clabel2) &&
3155 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3156 (clabel1->root_partition == clabel2->root_partition) &&
3157 (clabel1->last_unit == clabel2->last_unit) &&
3158 (clabel1->config_order == clabel2->config_order)) {
3159 		/* if it gets here, it almost *has* to be a match */
3160 } else {
3161 /* it's not consistent with somebody in the set..
3162 punt */
3163 return(0);
3164 }
3165 /* all was fine.. it must fit... */
3166 return(1);
3167 }
3168
3169 int
3170 rf_have_enough_components(RF_ConfigSet_t *cset)
3171 {
3172 RF_AutoConfig_t *ac;
3173 RF_AutoConfig_t *auto_config;
3174 RF_ComponentLabel_t *clabel;
3175 int c;
3176 int num_cols;
3177 int num_missing;
3178 int mod_counter;
3179 int mod_counter_found;
3180 int even_pair_failed;
3181 char parity_type;
3182
3183
3184 /* check to see that we have enough 'live' components
3185 of this set. If so, we can configure it if necessary */
3186
3187 num_cols = cset->ac->clabel->num_columns;
3188 parity_type = cset->ac->clabel->parityConfig;
3189
3190 /* XXX Check for duplicate components!?!?!? */
3191
3192 /* Determine what the mod_counter is supposed to be for this set. */
3193
3194 mod_counter_found = 0;
3195 mod_counter = 0;
3196 ac = cset->ac;
3197 while(ac!=NULL) {
3198 if (mod_counter_found==0) {
3199 mod_counter = ac->clabel->mod_counter;
3200 mod_counter_found = 1;
3201 } else {
3202 if (ac->clabel->mod_counter > mod_counter) {
3203 mod_counter = ac->clabel->mod_counter;
3204 }
3205 }
3206 ac = ac->next;
3207 }
3208
3209 num_missing = 0;
3210 auto_config = cset->ac;
3211
3212 even_pair_failed = 0;
3213 for(c=0; c<num_cols; c++) {
3214 ac = auto_config;
3215 while(ac!=NULL) {
3216 if ((ac->clabel->column == c) &&
3217 (ac->clabel->mod_counter == mod_counter)) {
3218 /* it's this one... */
3219 #ifdef DEBUG
3220 printf("Found: %s at %d\n",
3221 ac->devname,c);
3222 #endif
3223 break;
3224 }
3225 ac=ac->next;
3226 }
3227 if (ac==NULL) {
3228 /* Didn't find one here! */
3229 /* special case for RAID 1, especially
3230 where there are more than 2
3231 components (where RAIDframe treats
3232 things a little differently :( ) */
3233 if (parity_type == '1') {
3234 if (c%2 == 0) { /* even component */
3235 even_pair_failed = 1;
3236 } else { /* odd component. If
3237 we're failed, and
3238 so is the even
3239 component, it's
3240 "Good Night, Charlie" */
3241 if (even_pair_failed == 1) {
3242 return(0);
3243 }
3244 }
3245 } else {
3246 /* normal accounting */
3247 num_missing++;
3248 }
3249 }
3250 if ((parity_type == '1') && (c%2 == 1)) {
3251 			/* Just finished an even/odd pair and we didn't
3252 			   bail.. reset the even_pair_failed flag,
3253 			   and go on to the next pair.... */
3254 even_pair_failed = 0;
3255 }
3256 }
3257
3258 clabel = cset->ac->clabel;
3259
3260 if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3261 ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3262 ((clabel->parityConfig == '5') && (num_missing > 1))) {
3263 /* XXX this needs to be made *much* more general */
3264 /* Too many failures */
3265 return(0);
3266 }
3267 /* otherwise, all is well, and we've got enough to take a kick
3268 at autoconfiguring this set */
3269 return(1);
3270 }
3271
3272 void
3273 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3274 RF_Raid_t *raidPtr)
3275 {
3276 RF_ComponentLabel_t *clabel;
3277 int i;
3278
3279 clabel = ac->clabel;
3280
3281 /* 1. Fill in the common stuff */
3282 config->numCol = clabel->num_columns;
3283 config->numSpare = 0; /* XXX should this be set here? */
3284 config->sectPerSU = clabel->sectPerSU;
3285 config->SUsPerPU = clabel->SUsPerPU;
3286 config->SUsPerRU = clabel->SUsPerRU;
3287 config->parityConfig = clabel->parityConfig;
3288 /* XXX... */
3289 strcpy(config->diskQueueType,"fifo");
3290 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3291 config->layoutSpecificSize = 0; /* XXX ?? */
3292
3293 while(ac!=NULL) {
3294 /* row/col values will be in range due to the checks
3295 in reasonable_label() */
3296 strcpy(config->devnames[0][ac->clabel->column],
3297 ac->devname);
3298 ac = ac->next;
3299 }
3300
3301 for(i=0;i<RF_MAXDBGV;i++) {
3302 config->debugVars[i][0] = 0;
3303 }
3304 }
3305
3306 int
3307 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3308 {
3309 RF_ComponentLabel_t *clabel;
3310 int column;
3311 int sparecol;
3312
3313 raidPtr->autoconfigure = new_value;
3314
3315 for(column=0; column<raidPtr->numCol; column++) {
3316 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3317 clabel = raidget_component_label(raidPtr, column);
3318 clabel->autoconfigure = new_value;
3319 raidflush_component_label(raidPtr, column);
3320 }
3321 }
3322 for(column = 0; column < raidPtr->numSpare ; column++) {
3323 sparecol = raidPtr->numCol + column;
3324 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3325 clabel = raidget_component_label(raidPtr, sparecol);
3326 clabel->autoconfigure = new_value;
3327 raidflush_component_label(raidPtr, sparecol);
3328 }
3329 }
3330 return(new_value);
3331 }
3332
3333 int
3334 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3335 {
3336 RF_ComponentLabel_t *clabel;
3337 int column;
3338 int sparecol;
3339
3340 raidPtr->root_partition = new_value;
3341 for(column=0; column<raidPtr->numCol; column++) {
3342 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3343 clabel = raidget_component_label(raidPtr, column);
3344 clabel->root_partition = new_value;
3345 raidflush_component_label(raidPtr, column);
3346 }
3347 }
3348 for(column = 0; column < raidPtr->numSpare ; column++) {
3349 sparecol = raidPtr->numCol + column;
3350 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3351 clabel = raidget_component_label(raidPtr, sparecol);
3352 clabel->root_partition = new_value;
3353 raidflush_component_label(raidPtr, sparecol);
3354 }
3355 }
3356 return(new_value);
3357 }
3358
3359 void
3360 rf_release_all_vps(RF_ConfigSet_t *cset)
3361 {
3362 RF_AutoConfig_t *ac;
3363
3364 ac = cset->ac;
3365 while(ac!=NULL) {
3366 /* Close the vp, and give it back */
3367 if (ac->vp) {
3368 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3369 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3370 vput(ac->vp);
3371 ac->vp = NULL;
3372 }
3373 ac = ac->next;
3374 }
3375 }
3376
3377
3378 void
3379 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3380 {
3381 RF_AutoConfig_t *ac;
3382 RF_AutoConfig_t *next_ac;
3383
3384 ac = cset->ac;
3385 while(ac!=NULL) {
3386 next_ac = ac->next;
3387 /* nuke the label */
3388 free(ac->clabel, M_RAIDFRAME);
3389 /* cleanup the config structure */
3390 free(ac, M_RAIDFRAME);
3391 /* "next.." */
3392 ac = next_ac;
3393 }
3394 /* and, finally, nuke the config set */
3395 free(cset, M_RAIDFRAME);
3396 }
3397
3398
3399 void
3400 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3401 {
3402 /* current version number */
3403 clabel->version = RF_COMPONENT_LABEL_VERSION;
3404 clabel->serial_number = raidPtr->serial_number;
3405 clabel->mod_counter = raidPtr->mod_counter;
3406
3407 clabel->num_rows = 1;
3408 clabel->num_columns = raidPtr->numCol;
3409 clabel->clean = RF_RAID_DIRTY; /* not clean */
3410 clabel->status = rf_ds_optimal; /* "It's good!" */
3411
3412 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3413 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3414 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3415
3416 clabel->blockSize = raidPtr->bytesPerSector;
3417 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3418
3419 /* XXX not portable */
3420 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3421 clabel->maxOutstanding = raidPtr->maxOutstanding;
3422 clabel->autoconfigure = raidPtr->autoconfigure;
3423 clabel->root_partition = raidPtr->root_partition;
3424 clabel->last_unit = raidPtr->raidid;
3425 clabel->config_order = raidPtr->config_order;
3426
3427 #ifndef RF_NO_PARITY_MAP
3428 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3429 #endif
3430 }
3431
3432 struct raid_softc *
3433 rf_auto_config_set(RF_ConfigSet_t *cset)
3434 {
3435 RF_Raid_t *raidPtr;
3436 RF_Config_t *config;
3437 int raidID;
3438 struct raid_softc *sc;
3439
3440 #ifdef DEBUG
3441 printf("RAID autoconfigure\n");
3442 #endif
3443
3444 /* 1. Create a config structure */
3445 config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);
3446
3447 /*
3448 2. Figure out what RAID ID this one is supposed to live at
3449 See if we can get the same RAID dev that it was configured
3450 on last time..
3451 */
3452
3453 raidID = cset->ac->clabel->last_unit;
3454 for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
3455 sc = raidget(++raidID, false))
3456 continue;
3457 #ifdef DEBUG
3458 printf("Configuring raid%d:\n",raidID);
3459 #endif
3460
3461 if (sc == NULL)
3462 sc = raidget(raidID, true);
3463 raidPtr = &sc->sc_r;
3464
3465 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3466 raidPtr->softc = sc;
3467 raidPtr->raidid = raidID;
3468 raidPtr->openings = RAIDOUTSTANDING;
3469
3470 /* 3. Build the configuration structure */
3471 rf_create_configuration(cset->ac, config, raidPtr);
3472
3473 /* 4. Do the configuration */
3474 if (rf_Configure(raidPtr, config, cset->ac) == 0) {
3475 raidinit(sc);
3476
3477 rf_markalldirty(raidPtr);
3478 raidPtr->autoconfigure = 1; /* XXX do this here? */
3479 switch (cset->ac->clabel->root_partition) {
3480 case 1: /* Force Root */
3481 case 2: /* Soft Root: root when boot partition part of raid */
3482 /*
3483 * everything configured just fine. Make a note
3484 * that this set is eligible to be root,
3485 * or forced to be root
3486 */
3487 cset->rootable = cset->ac->clabel->root_partition;
3488 /* XXX do this here? */
3489 raidPtr->root_partition = cset->rootable;
3490 break;
3491 default:
3492 break;
3493 }
3494 } else {
3495 raidput(sc);
3496 sc = NULL;
3497 }
3498
3499 /* 5. Cleanup */
3500 free(config, M_RAIDFRAME);
3501 return sc;
3502 }
3503
3504 void
3505 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
3506 size_t xmin, size_t xmax)
3507 {
3508
3509 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3510 pool_sethiwat(p, xmax);
3511 pool_prime(p, xmin);
3512 }
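/*
 * Hypothetical rf_pool_init() call (the real call sites live in the
 * RAIDframe internals): prime the pool with enough items for the
 * steady state and cap the high-water mark somewhat above that for
 * bursts.
 */
#if 0
	rf_pool_init(&example_pool, sizeof(struct example_item),
	    "rf_example", 32, 40);	/* prime 32 items, hiwat 40 */
#endif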
3513
3514 /*
3515  * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buffer queue
3516 * to see if there is IO pending and if that IO could possibly be done
3517 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3518 * otherwise.
3519 *
3520 */
3521 int
3522 rf_buf_queue_check(RF_Raid_t *raidPtr)
3523 {
3524 struct raid_softc *rs;
3525 struct dk_softc *dksc;
3526
3527 rs = raidPtr->softc;
3528 dksc = &rs->sc_dksc;
3529
3530 if ((rs->sc_flags & RAIDF_INITED) == 0)
3531 return 1;
3532
3533 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3534 /* there is work to do */
3535 return 0;
3536 }
3537 /* default is nothing to do */
3538 return 1;
3539 }
3540
3541 int
3542 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3543 {
3544 uint64_t numsecs;
3545 unsigned secsize;
3546 int error;
3547
3548 error = getdisksize(vp, &numsecs, &secsize);
3549 if (error == 0) {
3550 diskPtr->blockSize = secsize;
3551 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3552 diskPtr->partitionSize = numsecs;
3553 return 0;
3554 }
3555 return error;
3556 }
3557
3558 static int
3559 raid_match(device_t self, cfdata_t cfdata, void *aux)
3560 {
3561 return 1;
3562 }
3563
3564 static void
3565 raid_attach(device_t parent, device_t self, void *aux)
3566 {
3567 }
3568
3569
3570 static int
3571 raid_detach(device_t self, int flags)
3572 {
3573 int error;
3574 struct raid_softc *rs = raidsoftc(self);
3575
3576 if (rs == NULL)
3577 return ENXIO;
3578
3579 if ((error = raidlock(rs)) != 0)
3580 return (error);
3581
3582 error = raid_detach_unlocked(rs);
3583
3584 raidunlock(rs);
3585
3586 /* XXX raid can be referenced here */
3587
3588 if (error)
3589 return error;
3590
3591 /* Free the softc */
3592 raidput(rs);
3593
3594 return 0;
3595 }
3596
3597 static void
3598 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3599 {
3600 struct dk_softc *dksc = &rs->sc_dksc;
3601 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3602
3603 memset(dg, 0, sizeof(*dg));
3604
3605 dg->dg_secperunit = raidPtr->totalSectors;
3606 dg->dg_secsize = raidPtr->bytesPerSector;
3607 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3608 dg->dg_ntracks = 4 * raidPtr->numCol;
3609
3610 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3611 }
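/*
 * Illustrative numbers for the synthetic geometry above: a 5-column
 * RAID 5 set with 32 sectors per stripe unit has
 * dataSectorsPerStripe == (5 - 1) * 32 == 128, so dg_nsectors == 128
 * and dg_ntracks == 20; disk_set_info() derives the remaining fields
 * from dg_secperunit.
 */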
3612
3613 /*
3614 * Get cache info for all the components (including spares).
3615 * Returns intersection of all the cache flags of all disks, or first
3616 * error if any encountered.
3617 * XXXfua feature flags can change as spares are added - lock down somehow
3618 */
3619 static int
3620 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3621 {
3622 int c;
3623 int error;
3624 int dkwhole = 0, dkpart;
3625
3626 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3627 /*
3628 * Check any non-dead disk, even when currently being
3629 * reconstructed.
3630 */
3631 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
3632 || raidPtr->Disks[c].status == rf_ds_reconstructing) {
3633 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3634 DIOCGCACHE, &dkpart, FREAD, NOCRED);
3635 if (error) {
3636 if (error != ENODEV) {
3637 printf("raid%d: get cache for component %s failed\n",
3638 raidPtr->raidid,
3639 raidPtr->Disks[c].devname);
3640 }
3641
3642 return error;
3643 }
3644
3645 if (c == 0)
3646 dkwhole = dkpart;
3647 else
3648 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3649 }
3650 }
3651
3652 *data = dkwhole;
3653
3654 return 0;
3655 }
3656
3657 /*
3658 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3659 * We end up returning whatever error was returned by the first cache flush
3660 * that fails.
3661 */
3662
3663 int
3664 rf_sync_component_caches(RF_Raid_t *raidPtr)
3665 {
3666 int c, sparecol;
3667 int e,error;
3668 int force = 1;
3669
3670 error = 0;
3671 for (c = 0; c < raidPtr->numCol; c++) {
3672 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3673 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3674 &force, FWRITE, NOCRED);
3675 if (e) {
3676 if (e != ENODEV)
3677 printf("raid%d: cache flush to component %s failed.\n",
3678 raidPtr->raidid, raidPtr->Disks[c].devname);
3679 if (error == 0) {
3680 error = e;
3681 }
3682 }
3683 }
3684 }
3685
3686 for( c = 0; c < raidPtr->numSpare ; c++) {
3687 sparecol = raidPtr->numCol + c;
3688 /* Need to ensure that the reconstruct actually completed! */
3689 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3690 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3691 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3692 if (e) {
3693 if (e != ENODEV)
3694 printf("raid%d: cache flush to component %s failed.\n",
3695 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3696 if (error == 0) {
3697 error = e;
3698 }
3699 }
3700 }
3701 }
3702 return error;
3703 }
3704
3705 /* Fill in info with the current status */
3706 void
3707 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3708 {
3709
3710 if (raidPtr->status != rf_rs_reconstructing) {
3711 info->total = 100;
3712 info->completed = 100;
3713 } else {
3714 info->total = raidPtr->reconControl->numRUsTotal;
3715 info->completed = raidPtr->reconControl->numRUsComplete;
3716 }
3717 info->remaining = info->total - info->completed;
3718 }
3719
3720 /* Fill in info with the current status */
3721 void
3722 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3723 {
3724
3725 if (raidPtr->parity_rewrite_in_progress == 1) {
3726 info->total = raidPtr->Layout.numStripe;
3727 info->completed = raidPtr->parity_rewrite_stripes_done;
3728 } else {
3729 info->completed = 100;
3730 info->total = 100;
3731 }
3732 info->remaining = info->total - info->completed;
3733 }
3734
3735 /* Fill in info with the current status */
3736 void
3737 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3738 {
3739
3740 if (raidPtr->copyback_in_progress == 1) {
3741 info->total = raidPtr->Layout.numStripe;
3742 info->completed = raidPtr->copyback_stripes_done;
3743 info->remaining = info->total - info->completed;
3744 } else {
3745 info->remaining = 0;
3746 info->completed = 100;
3747 info->total = 100;
3748 }
3749 }
3750
3751 /* Fill in config with the current info */
3752 int
3753 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3754 {
3755 int d, i, j;
3756
3757 if (!raidPtr->valid)
3758 return (ENODEV);
3759 config->cols = raidPtr->numCol;
3760 config->ndevs = raidPtr->numCol;
3761 if (config->ndevs >= RF_MAX_DISKS)
3762 return (ENOMEM);
3763 config->nspares = raidPtr->numSpare;
3764 if (config->nspares >= RF_MAX_DISKS)
3765 return (ENOMEM);
3766 config->maxqdepth = raidPtr->maxQueueDepth;
3767 d = 0;
3768 for (j = 0; j < config->cols; j++) {
3769 config->devs[d] = raidPtr->Disks[j];
3770 d++;
3771 }
3772 for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
3773 config->spares[i] = raidPtr->Disks[j];
3774 if (config->spares[i].status == rf_ds_rebuilding_spare) {
3775 /* XXX: raidctl(8) expects to see this as a used spare */
3776 config->spares[i].status = rf_ds_used_spare;
3777 }
3778 }
3779 return 0;
3780 }
3781
3782 int
3783 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
3784 {
3785 RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
3786 RF_ComponentLabel_t *raid_clabel;
3787 int column = clabel->column;
3788
3789 if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
3790 return EINVAL;
3791 raid_clabel = raidget_component_label(raidPtr, column);
3792 memcpy(clabel, raid_clabel, sizeof *clabel);
3793
3794 return 0;
3795 }
3796
3797 /*
3798 * Module interface
3799 */
3800
3801 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
3802
3803 #ifdef _MODULE
3804 CFDRIVER_DECL(raid, DV_DISK, NULL);
3805 #endif
3806
3807 static int raid_modcmd(modcmd_t, void *);
3808 static int raid_modcmd_init(void);
3809 static int raid_modcmd_fini(void);
3810
3811 static int
3812 raid_modcmd(modcmd_t cmd, void *data)
3813 {
3814 int error;
3815
3816 error = 0;
3817 switch (cmd) {
3818 case MODULE_CMD_INIT:
3819 error = raid_modcmd_init();
3820 break;
3821 case MODULE_CMD_FINI:
3822 error = raid_modcmd_fini();
3823 break;
3824 default:
3825 error = ENOTTY;
3826 break;
3827 }
3828 return error;
3829 }
3830
3831 static int
3832 raid_modcmd_init(void)
3833 {
3834 int error;
3835 int bmajor, cmajor;
3836
3837 mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
3838 mutex_enter(&raid_lock);
3839 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3840 rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
3841 rf_init_cond2(rf_sparet_wait_cv, "sparetw");
3842 rf_init_cond2(rf_sparet_resp_cv, "rfgst");
3843
3844 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
3845 #endif
3846
3847 bmajor = cmajor = -1;
3848 error = devsw_attach("raid", &raid_bdevsw, &bmajor,
3849 &raid_cdevsw, &cmajor);
3850 if (error != 0 && error != EEXIST) {
3851 aprint_error("%s: devsw_attach failed %d\n", __func__, error);
3852 mutex_exit(&raid_lock);
3853 return error;
3854 }
3855 #ifdef _MODULE
3856 error = config_cfdriver_attach(&raid_cd);
3857 if (error != 0) {
3858 aprint_error("%s: config_cfdriver_attach failed %d\n",
3859 __func__, error);
3860 devsw_detach(&raid_bdevsw, &raid_cdevsw);
3861 mutex_exit(&raid_lock);
3862 return error;
3863 }
3864 #endif
3865 error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3866 if (error != 0) {
3867 aprint_error("%s: config_cfattach_attach failed %d\n",
3868 __func__, error);
3869 #ifdef _MODULE
3870 config_cfdriver_detach(&raid_cd);
3871 #endif
3872 devsw_detach(&raid_bdevsw, &raid_cdevsw);
3873 mutex_exit(&raid_lock);
3874 return error;
3875 }
3876
3877 raidautoconfigdone = false;
3878
3879 mutex_exit(&raid_lock);
3880
3881 if (error == 0) {
3882 if (rf_BootRaidframe(true) == 0)
3883 aprint_verbose("Kernelized RAIDframe activated\n");
3884 else
3885 panic("Serious error activating RAID!!");
3886 }
3887
3888 /*
3889 * Register a finalizer which will be used to auto-config RAID
3890 * sets once all real hardware devices have been found.
3891 */
3892 error = config_finalize_register(NULL, rf_autoconfig);
3893 if (error != 0) {
3894 aprint_error("WARNING: unable to register RAIDframe "
3895 "finalizer\n");
3896 error = 0;
3897 }
3898
3899 return error;
3900 }
3901
3902 static int
3903 raid_modcmd_fini(void)
3904 {
3905 int error;
3906
3907 mutex_enter(&raid_lock);
3908
3909 /* Don't allow unload if raid device(s) exist. */
3910 if (!LIST_EMPTY(&raids)) {
3911 mutex_exit(&raid_lock);
3912 return EBUSY;
3913 }
3914
3915 error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
3916 if (error != 0) {
3917 aprint_error("%s: cannot detach cfattach\n",__func__);
3918 mutex_exit(&raid_lock);
3919 return error;
3920 }
3921 #ifdef _MODULE
3922 error = config_cfdriver_detach(&raid_cd);
3923 if (error != 0) {
3924 aprint_error("%s: cannot detach cfdriver\n",__func__);
3925 config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3926 mutex_exit(&raid_lock);
3927 return error;
3928 }
3929 #endif
3930 error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
3931 if (error != 0) {
3932 aprint_error("%s: cannot detach devsw\n",__func__);
3933 #ifdef _MODULE
3934 config_cfdriver_attach(&raid_cd);
3935 #endif
3936 config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3937 mutex_exit(&raid_lock);
3938 return error;
3939 }
3940 rf_BootRaidframe(false);
3941 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3942 rf_destroy_mutex2(rf_sparet_wait_mutex);
3943 rf_destroy_cond2(rf_sparet_wait_cv);
3944 rf_destroy_cond2(rf_sparet_resp_cv);
3945 #endif
3946 mutex_exit(&raid_lock);
3947 mutex_destroy(&raid_lock);
3948
3949 return error;
3950 }
3951