rf_netbsdkintf.c revision 1.372 1 /* $NetBSD: rf_netbsdkintf.c,v 1.372 2019/02/06 23:00:16 christos Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.372 2019/02/06 23:00:16 christos Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_raid_autoconfig.h"
108 #include "opt_compat_netbsd32.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/compat_stub.h>
131
132 #include <prop/proplib.h>
133
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137
138 #include "rf_raid.h"
139 #include "rf_copyback.h"
140 #include "rf_dag.h"
141 #include "rf_dagflags.h"
142 #include "rf_desc.h"
143 #include "rf_diskqueue.h"
144 #include "rf_etimer.h"
145 #include "rf_general.h"
146 #include "rf_kintf.h"
147 #include "rf_options.h"
148 #include "rf_driver.h"
149 #include "rf_parityscan.h"
150 #include "rf_threadstuff.h"
151
152 #include "ioconf.h"
153
154 #ifdef DEBUG
155 int rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else /* DEBUG */
158 #define db1_printf(a) { }
159 #endif /* DEBUG */
160
161 #ifdef DEBUG_ROOT
162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
163 #else
164 #define DPRINTF(a, ...)
165 #endif
166
167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
168 static rf_declare_mutex2(rf_sparet_wait_mutex);
169 static rf_declare_cond2(rf_sparet_wait_cv);
170 static rf_declare_cond2(rf_sparet_resp_cv);
171
172 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
173 * spare table */
174 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
175 * installation process */
176 #endif
177
178 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
179
180 /* prototypes */
181 static void KernelWakeupFunc(struct buf *);
182 static void InitBP(struct buf *, struct vnode *, unsigned,
183 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
184 void *, int, struct proc *);
185 static void raidinit(struct raid_softc *);
186 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
187 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
188
189 static int raid_match(device_t, cfdata_t, void *);
190 static void raid_attach(device_t, device_t, void *);
191 static int raid_detach(device_t, int);
192
193 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
194 daddr_t, daddr_t);
195 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
196 daddr_t, daddr_t, int);
197
198 static int raidwrite_component_label(unsigned,
199 dev_t, struct vnode *, RF_ComponentLabel_t *);
200 static int raidread_component_label(unsigned,
201 dev_t, struct vnode *, RF_ComponentLabel_t *);
202
203 static int raid_diskstart(device_t, struct buf *bp);
204 static int raid_dumpblocks(device_t, void *, daddr_t, int);
205 static int raid_lastclose(device_t);
206
207 static dev_type_open(raidopen);
208 static dev_type_close(raidclose);
209 static dev_type_read(raidread);
210 static dev_type_write(raidwrite);
211 static dev_type_ioctl(raidioctl);
212 static dev_type_strategy(raidstrategy);
213 static dev_type_dump(raiddump);
214 static dev_type_size(raidsize);
215
216 const struct bdevsw raid_bdevsw = {
217 .d_open = raidopen,
218 .d_close = raidclose,
219 .d_strategy = raidstrategy,
220 .d_ioctl = raidioctl,
221 .d_dump = raiddump,
222 .d_psize = raidsize,
223 .d_discard = nodiscard,
224 .d_flag = D_DISK
225 };
226
227 const struct cdevsw raid_cdevsw = {
228 .d_open = raidopen,
229 .d_close = raidclose,
230 .d_read = raidread,
231 .d_write = raidwrite,
232 .d_ioctl = raidioctl,
233 .d_stop = nostop,
234 .d_tty = notty,
235 .d_poll = nopoll,
236 .d_mmap = nommap,
237 .d_kqfilter = nokqfilter,
238 .d_discard = nodiscard,
239 .d_flag = D_DISK
240 };
241
242 static struct dkdriver rf_dkdriver = {
243 .d_open = raidopen,
244 .d_close = raidclose,
245 .d_strategy = raidstrategy,
246 .d_diskstart = raid_diskstart,
247 .d_dumpblocks = raid_dumpblocks,
248 .d_lastclose = raid_lastclose,
249 .d_minphys = minphys
250 };
251
252 #define raidunit(x) DISKUNIT(x)
253 #define raidsoftc(dev) (((struct raid_softc *)device_private(dev))->sc_r.softc)
254
255 extern struct cfdriver raid_cd;
256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
257 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
258 DVF_DETACH_SHUTDOWN);
259
260 /* Internal representation of a rf_recon_req */
261 struct rf_recon_req_internal {
262 RF_RowCol_t col;
263 RF_ReconReqFlags_t flags;
264 void *raidPtr;
265 };
266
267 /*
268 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
269 * Be aware that large numbers can allow the driver to consume a lot of
270 * kernel memory, especially on writes, and in degraded mode reads.
271 *
272 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
273 * a single 64K write will typically require 64K for the old data,
274 * 64K for the old parity, and 64K for the new parity, for a total
275 * of 192K (if the parity buffer is not re-used immediately).
276 * Even it if is used immediately, that's still 128K, which when multiplied
277 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
278 *
279 * Now in degraded mode, for example, a 64K read on the above setup may
280 * require data reconstruction, which will require *all* of the 4 remaining
281 * disks to participate -- 4 * 32K/disk == 128K again.
282 */
283
284 #ifndef RAIDOUTSTANDING
285 #define RAIDOUTSTANDING 6
286 #endif
287
288 #define RAIDLABELDEV(dev) \
289 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
290
291 /* declared here, and made public, for the benefit of KVM stuff.. */
292
293 static int raidlock(struct raid_softc *);
294 static void raidunlock(struct raid_softc *);
295
296 static int raid_detach_unlocked(struct raid_softc *);
297
298 static void rf_markalldirty(RF_Raid_t *);
299 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
300
301 void rf_ReconThread(struct rf_recon_req_internal *);
302 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
303 void rf_CopybackThread(RF_Raid_t *raidPtr);
304 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
305 int rf_autoconfig(device_t);
306 void rf_buildroothack(RF_ConfigSet_t *);
307
308 RF_AutoConfig_t *rf_find_raid_components(void);
309 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
310 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
311 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
312 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
313 int rf_set_autoconfig(RF_Raid_t *, int);
314 int rf_set_rootpartition(RF_Raid_t *, int);
315 void rf_release_all_vps(RF_ConfigSet_t *);
316 void rf_cleanup_config_set(RF_ConfigSet_t *);
317 int rf_have_enough_components(RF_ConfigSet_t *);
318 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
319 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
320
321 /*
322 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
323 * Note that this is overridden by having RAID_AUTOCONFIG as an option
324 * in the kernel config file.
325 */
326 #ifdef RAID_AUTOCONFIG
327 int raidautoconfig = 1;
328 #else
329 int raidautoconfig = 0;
330 #endif
331 static bool raidautoconfigdone = false;
332
333 struct RF_Pools_s rf_pools;
334
335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
336 static kmutex_t raid_lock;
337
338 static struct raid_softc *
339 raidcreate(int unit) {
340 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
341 sc->sc_unit = unit;
342 cv_init(&sc->sc_cv, "raidunit");
343 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
344 return sc;
345 }
346
static void
raiddestroy(struct raid_softc *sc) {
	/* Tear down the primitives created in raidcreate() and free sc. */
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mutex);
	kmem_free(sc, sizeof(*sc));
}
353
354 static struct raid_softc *
355 raidget(int unit, bool create) {
356 struct raid_softc *sc;
357 if (unit < 0) {
358 #ifdef DIAGNOSTIC
359 panic("%s: unit %d!", __func__, unit);
360 #endif
361 return NULL;
362 }
363 mutex_enter(&raid_lock);
364 LIST_FOREACH(sc, &raids, sc_link) {
365 if (sc->sc_unit == unit) {
366 mutex_exit(&raid_lock);
367 return sc;
368 }
369 }
370 mutex_exit(&raid_lock);
371 if (!create)
372 return NULL;
373 if ((sc = raidcreate(unit)) == NULL)
374 return NULL;
375 mutex_enter(&raid_lock);
376 LIST_INSERT_HEAD(&raids, sc, sc_link);
377 mutex_exit(&raid_lock);
378 return sc;
379 }
380
static void
raidput(struct raid_softc *sc) {
	/* Unlink the unit from the global list, then destroy it. */
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
388
void
raidattach(int num)
{
	/* "num" is intentionally unused. */

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
398
399 int
400 rf_autoconfig(device_t self)
401 {
402 RF_AutoConfig_t *ac_list;
403 RF_ConfigSet_t *config_sets;
404
405 if (!raidautoconfig || raidautoconfigdone == true)
406 return (0);
407
408 /* XXX This code can only be run once. */
409 raidautoconfigdone = true;
410
411 #ifdef __HAVE_CPU_BOOTCONF
412 /*
413 * 0. find the boot device if needed first so we can use it later
414 * this needs to be done before we autoconfigure any raid sets,
415 * because if we use wedges we are not going to be able to open
416 * the boot device later
417 */
418 if (booted_device == NULL)
419 cpu_bootconf();
420 #endif
421 /* 1. locate all RAID components on the system */
422 aprint_debug("Searching for RAID components...\n");
423 ac_list = rf_find_raid_components();
424
425 /* 2. Sort them into their respective sets. */
426 config_sets = rf_create_auto_sets(ac_list);
427
428 /*
429 * 3. Evaluate each set and configure the valid ones.
430 * This gets done in rf_buildroothack().
431 */
432 rf_buildroothack(config_sets);
433
434 return 1;
435 }
436
int
rf_inited(const struct raid_softc *rs) {
	/* Non-zero iff the unit has been configured (RAIDF_INITED set). */
	return (rs->sc_flags & RAIDF_INITED) != 0;
}
441
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	/* Accessor: the RAIDframe state embedded in the softc. */
	return &rs->sc_r;
}
446
int
rf_get_unit(const struct raid_softc *rs) {
	/* Accessor: the raid unit number of this softc. */
	return rs->sc_unit;
}
451
452 static int
453 rf_containsboot(RF_Raid_t *r, device_t bdv) {
454 const char *bootname;
455 size_t len;
456
457 /* if bdv is NULL, the set can't contain it. exit early. */
458 if (bdv == NULL)
459 return 0;
460
461 bootname = device_xname(bdv);
462 len = strlen(bootname);
463
464 for (int col = 0; col < r->numCol; col++) {
465 const char *devname = r->Disks[col].devname;
466 devname += sizeof("/dev/") - 1;
467 if (strncmp(devname, "dk", 2) == 0) {
468 const char *parent =
469 dkwedge_get_parent_name(r->Disks[col].dev);
470 if (parent != NULL)
471 devname = parent;
472 }
473 if (strncmp(devname, bootname, len) == 0) {
474 struct raid_softc *sc = r->softc;
475 aprint_debug("raid%d includes boot device %s\n",
476 sc->sc_unit, devname);
477 return 1;
478 }
479 }
480 return 0;
481 }
482
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/*
	 * Pass 1: walk every discovered configuration set, configure
	 * the eligible ones, and remember the last rootable set (rsc)
	 * and how many rootable sets were seen (num_root).
	 */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}
	/*
	 * NOTE(review): rsc is still NULL here when no rootable set was
	 * configured; &rsc->sc_dksc is then a NULL-based address
	 * computation.  dksc is only consumed below when num_root > 0,
	 * but this looks worth tightening — TODO confirm.
	 */
	dksc = &rsc->sc_dksc;

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
		    "contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			/* override root with the raid device */
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/*
		 * Pass 2: several rootable sets — pick the one that
		 * actually contains the device we booted from.
		 */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
625
626 static int
627 raidsize(dev_t dev)
628 {
629 struct raid_softc *rs;
630 struct dk_softc *dksc;
631 unsigned int unit;
632
633 unit = raidunit(dev);
634 if ((rs = raidget(unit, false)) == NULL)
635 return -1;
636 dksc = &rs->sc_dksc;
637
638 if ((rs->sc_flags & RAIDF_INITED) == 0)
639 return -1;
640
641 return dk_size(dksc, dev);
642 }
643
644 static int
645 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
646 {
647 unsigned int unit;
648 struct raid_softc *rs;
649 struct dk_softc *dksc;
650
651 unit = raidunit(dev);
652 if ((rs = raidget(unit, false)) == NULL)
653 return ENXIO;
654 dksc = &rs->sc_dksc;
655
656 if ((rs->sc_flags & RAIDF_INITED) == 0)
657 return ENODEV;
658
659 /*
660 Note that blkno is relative to this particular partition.
661 By adding adding RF_PROTECTED_SECTORS, we get a value that
662 is relative to the partition used for the underlying component.
663 */
664 blkno += RF_PROTECTED_SECTORS;
665
666 return dk_dump(dksc, dev, blkno, va, size);
667 }
668
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	 */

	/* Step 1: prefer the first optimal (live) component. */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one?  Map the spare back to the
			   column it replaces. */
			scol = -1;
			for (j = 0; j < raidPtr->numCol; j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	/* Pass the dump straight through to the chosen component's
	   block-device driver. */
	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
774
775 /* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int error = 0;
	int part, pmask;

	/* Create the softc on first open so the unit can later be
	   configured through ioctls. */
	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* Refuse new opens while a shutdown/detach is pending. */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	/* An unconfigured unit opens successfully (error stays 0) so
	   the configuration ioctls can be issued against it. */
	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);
}
824
static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/* Last one... device is not unconfigured yet.
	   Device shutdown has taken care of setting the
	   clean bits if RAIDF_INITED is not set
	   mark things as clean... */

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code: if a shutdown was requested, ask
	   raidclose() to detach the device once it drops the lock */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}
844
845 /* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* raid_lastclose() sets RAIDF_DETACH on the final
		   close when a shutdown was requested */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	/* The detach/destroy work must happen after the unit lock is
	   dropped, hence the do_detach/do_put flags. */
	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);
}
884
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	/* Signal the iodone thread that there may be queued work. */
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
892
893 static void
894 raidstrategy(struct buf *bp)
895 {
896 unsigned int unit;
897 struct raid_softc *rs;
898 struct dk_softc *dksc;
899 RF_Raid_t *raidPtr;
900
901 unit = raidunit(bp->b_dev);
902 if ((rs = raidget(unit, false)) == NULL) {
903 bp->b_error = ENXIO;
904 goto fail;
905 }
906 if ((rs->sc_flags & RAIDF_INITED) == 0) {
907 bp->b_error = ENXIO;
908 goto fail;
909 }
910 dksc = &rs->sc_dksc;
911 raidPtr = &rs->sc_r;
912
913 /* Queue IO only */
914 if (dk_strategy_defer(dksc, bp))
915 goto done;
916
917 /* schedule the IO to happen at the next convenient time */
918 raid_wakeup(raidPtr);
919
920 done:
921 return;
922
923 fail:
924 bp->b_resid = bp->b_bcount;
925 biodone(bp);
926 }
927
928 static int
929 raid_diskstart(device_t dev, struct buf *bp)
930 {
931 struct raid_softc *rs = raidsoftc(dev);
932 RF_Raid_t *raidPtr;
933
934 raidPtr = &rs->sc_r;
935 if (!raidPtr->valid) {
936 db1_printf(("raid is not valid..\n"));
937 return ENODEV;
938 }
939
940 /* XXX */
941 bp->b_resid = 0;
942
943 return raiddoaccess(raidPtr, bp);
944 }
945
946 void
947 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
948 {
949 struct raid_softc *rs;
950 struct dk_softc *dksc;
951
952 rs = raidPtr->softc;
953 dksc = &rs->sc_dksc;
954
955 dk_done(dksc, bp);
956
957 rf_lock_mutex2(raidPtr->mutex);
958 raidPtr->openings++;
959 rf_unlock_mutex2(raidPtr->mutex);
960
961 /* schedule more IO */
962 raid_wakeup(raidPtr);
963 }
964
965 /* ARGSUSED */
966 static int
967 raidread(dev_t dev, struct uio *uio, int flags)
968 {
969 int unit = raidunit(dev);
970 struct raid_softc *rs;
971
972 if ((rs = raidget(unit, false)) == NULL)
973 return ENXIO;
974
975 if ((rs->sc_flags & RAIDF_INITED) == 0)
976 return (ENXIO);
977
978 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
979
980 }
981
982 /* ARGSUSED */
983 static int
984 raidwrite(dev_t dev, struct uio *uio, int flags)
985 {
986 int unit = raidunit(dev);
987 struct raid_softc *rs;
988
989 if ((rs = raidget(unit, false)) == NULL)
990 return ENXIO;
991
992 if ((rs->sc_flags & RAIDF_INITED) == 0)
993 return (ENXIO);
994
995 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
996
997 }
998
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse to detach while the device is open or while a
	   reconstruction, parity rewrite, or copyback is running. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing configured: nothing to tear down. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
1036
1037 static bool
1038 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
1039 {
1040 switch (cmd) {
1041 case RAIDFRAME_ADD_HOT_SPARE:
1042 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1043 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1044 case RAIDFRAME_CHECK_PARITY:
1045 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1046 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1047 case RAIDFRAME_CHECK_RECON_STATUS:
1048 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1049 case RAIDFRAME_COPYBACK:
1050 case RAIDFRAME_DELETE_COMPONENT:
1051 case RAIDFRAME_FAIL_DISK:
1052 case RAIDFRAME_GET_ACCTOTALS:
1053 case RAIDFRAME_GET_COMPONENT_LABEL:
1054 case RAIDFRAME_GET_INFO:
1055 case RAIDFRAME_GET_SIZE:
1056 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1057 case RAIDFRAME_INIT_LABELS:
1058 case RAIDFRAME_KEEP_ACCTOTALS:
1059 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1060 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1061 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1062 case RAIDFRAME_PARITYMAP_STATUS:
1063 case RAIDFRAME_REBUILD_IN_PLACE:
1064 case RAIDFRAME_REMOVE_HOT_SPARE:
1065 case RAIDFRAME_RESET_ACCTOTALS:
1066 case RAIDFRAME_REWRITEPARITY:
1067 case RAIDFRAME_SET_AUTOCONFIG:
1068 case RAIDFRAME_SET_COMPONENT_LABEL:
1069 case RAIDFRAME_SET_ROOT:
1070 return (rs->sc_flags & RAIDF_INITED) == 0;
1071 }
1072 return false;
1073 }
1074
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	/* The status checks below must be made with the mutex held. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	/* Mutex is deliberately dropped before the allocation and
	   thread creation below. */
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* The recon thread takes ownership of rrint and frees it. */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
1123
1124 static int
1125 rf_copyinspecificbuf(RF_Config_t *k_cfg)
1126 {
1127 /* allocate a buffer for the layout-specific data, and copy it in */
1128 if (k_cfg->layoutSpecificSize == 0)
1129 return 0;
1130
1131 if (k_cfg->layoutSpecificSize > 10000) {
1132 /* sanity check */
1133 return EINVAL;
1134 }
1135
1136 u_char *specific_buf;
1137 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize, (u_char *));
1138 if (specific_buf == NULL)
1139 return ENOMEM;
1140
1141 int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1142 k_cfg->layoutSpecificSize);
1143 if (retcode) {
1144 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1145 db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
1146 return retcode;
1147 }
1148
1149 k_cfg->layoutSpecific = specific_buf;
1150 return 0;
1151 }
1152
1153 static int
1154 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
1155 {
1156 RF_Config_t *u_cfg = *((RF_Config_t **) data);
1157
1158 if (rs->sc_r.valid) {
1159 /* There is a valid RAID set running on this unit! */
1160 printf("raid%d: Device already configured!\n", rs->sc_unit);
1161 return EINVAL;
1162 }
1163
1164 /* copy-in the configuration information */
1165 /* data points to a pointer to the configuration structure */
1166 RF_Malloc(*k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1167 if (*k_cfg == NULL) {
1168 return ENOMEM;
1169 }
1170 int retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1171 if (retcode == 0)
1172 return 0;
1173 RF_Free(*k_cfg, sizeof(RF_Config_t));
1174 db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
1175 rs->sc_flags |= RAIDF_SHUTDOWN;
1176 return retcode;
1177 }
1178
/*
 * Configure and bring up a RAID set on `rs' from the kernel-resident
 * configuration `k_cfg' (as produced by rf_getConfiguration()).
 * Consumes k_cfg (and its layout-specific buffer) on all paths.
 * Returns 0 on success; on failure the unit is flagged RAIDF_SHUTDOWN
 * so it is detached on close.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* On failure this has already freed its own buffer;
	 * k_cfg->layoutSpecific still points at user space then. */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 * there is no stale data left in the case of a
	 * reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
1231
#if RF_DISABLED
/*
 * Copy a user-supplied component label over the in-core label for its
 * column and flush it to the component.  Currently compiled out
 * (RF_DISABLED); see the XXX notes -- the label contents are not
 * validated before being written.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	/* Rows are not supported; only the column selects a component. */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
1270
/*
 * (Re)initialize the component labels of every live component of the
 * array.  Only the serial number is taken from the user-supplied
 * label; everything else is regenerated from the running
 * configuration.  Always returns 0.
 */
static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above.  We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	   */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		/* Skip failed/absent components. */
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we dont' pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		/* Write the finished label out to the component. */
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}
1303
/*
 * Rebuild the component named by `componentPtr' in place (back onto
 * the same disk) by spawning a reconstruct-in-place thread.  Returns 0
 * when the thread was created, EINVAL when the request is not allowed
 * (RAID 0, recon already running, bad column, too many failures, or
 * the component is reconstructing/spared), or ENOMEM.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Work on a local copy so the caller's buffer is not relied on. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* Component-status checks are made under the array mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		    raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d:     Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		/* Can't rebuild a component that has been spared. */
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	/* rrint is owned (and freed) by the recon thread from here on. */
	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
1371
1372 static int
1373 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
1374 {
1375 /*
1376 * This makes no sense on a RAID 0, or if we are not reconstructing
1377 * so tell the user it's done.
1378 */
1379 if (raidPtr->Layout.map->faultsTolerated == 0 ||
1380 raidPtr->status != rf_rs_reconstructing) {
1381 *data = 100;
1382 return 0;
1383 }
1384 if (raidPtr->reconControl->numRUsTotal == 0) {
1385 *data = 0;
1386 return 0;
1387 }
1388 *data = (raidPtr->reconControl->numRUsComplete * 100
1389 / raidPtr->reconControl->numRUsTotal);
1390 return 0;
1391 }
1392
/*
 * Ioctl entry point for raid devices.  RAIDframe-specific commands
 * (group 'r') are handled inline or dispatched to the rf_*() helpers
 * above; unrecognized 'r' commands are offered to the compat modules;
 * everything else falls through to the generic dk_ioctl() handling at
 * the bottom.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int unit = raidunit(dev);
	int part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg;
	RF_Raid_t *raidPtr;
	RF_AccTotals_t *totals;
	RF_SingleComponent_t component;
	RF_DeviceConfig_t *d_cfg, *ucfgp;
	int retcode = 0;
	int column;
	RF_ComponentLabel_t *clabel;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	if (rf_must_be_initialized(rs, cmd))
		return ENXIO;

	switch (cmd) {
		/* configure the system */
	case RAIDFRAME_CONFIGURE:
		/* rf_construct() consumes k_cfg on all paths. */
		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
			return retcode;
		return rf_construct(rs, k_cfg);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((retcode = raidlock(rs)) != 0)
			return retcode;

		/* Refuse while the device is open or any background
		 * operation (recon/rewrite/copyback) is running. */
		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return retcode;
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if RF_DISABLED
	case RAIDFRAME_SET_COMPONENT_LABEL:
		return rf_set_component_label(raidPtr, data);
#endif

	case RAIDFRAME_INIT_LABELS:
		return rf_init_component_label(raidPtr, data);

	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return 0;
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
		    rf_RewriteParityThread, raidPtr,"raid_parity");

	case RAIDFRAME_ADD_HOT_SPARE:
		/* Copy to a local so we don't rely on the user buffer. */
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
		return rf_add_hot_spare(raidPtr, &component);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* Not implemented; reports success. */
		return retcode;

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_delete_component(raidPtr, &component);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_incorporate_hot_spare(raidPtr, &component);

	case RAIDFRAME_REBUILD_IN_PLACE:
		return rf_rebuild_in_place(raidPtr, data);

	case RAIDFRAME_GET_INFO:
		/* Snapshot the device config and copy it out to the
		 * user-supplied RF_DeviceConfig_t pointer. */
		ucfgp = *(RF_DeviceConfig_t **)data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return ENOMEM;
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
		return retcode;

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return 0;

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map, data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return 0;

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return 0;

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return 0;

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return 0;

	case RAIDFRAME_FAIL_DISK:
		return rf_fail_disk(raidPtr, data);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return EINVAL;
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->copyback_thread,
		    rf_CopybackThread, raidPtr, "raid_copyback");

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		return rf_check_recon_status(raidPtr, data);

	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		/* Only allowed when every component is optimal. */
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table -- XXX */
#if RF_DISABLED
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return 0;

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the dameon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return 0;

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return retcode;
#endif
	default:
		/*
		 * Don't bother trying to load compat modules
		 * if it is not our ioctl. This is more efficient
		 * and makes rump tests not depend on compat code
		 */
		if (IOCGROUP(cmd) != 'r')
			break;
#ifdef _LP64
		/* 32-bit process on a 64-bit kernel: try the netbsd32
		 * compat hook first. */
		if ((l->l_proc->p_flag & PK_32) != 0) {
			module_autoload("compat_netbsd32_raid",
			    MODULE_CLASS_EXEC);
			MODULE_CALL_HOOK(raidframe_netbsd32_ioctl_hook,
			    (rs, cmd, data), enosys(), retcode);
			if (retcode != EPASSTHROUGH)
				return retcode;
		}
#endif
		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
		MODULE_CALL_HOOK(raidframe_ioctl_80_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;

		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
		MODULE_CALL_HOOK(raidframe_ioctl_50_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}
1768
1769
1770 /* raidinit -- complete the rest of the initialization for the
1771 RAIDframe device. */
1772
1773
/*
 * Complete device-level initialization after a successful
 * rf_Configure(): attach the pseudo-device, hook up the dk(9)/disk(9)
 * layers, mark the unit RAIDF_INITED and kick off wedge discovery.
 * On config_attach_pseudo() failure the unit is simply left
 * uninitialized (no error is returned to the caller).
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* I/O requests are queued FCFS, sorted by raw block number. */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Discover wedges on this disk. */
	dkwedge_discover(&dksc->sc_dkdev);
}
1829
1830 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1831 /* wake up the daemon & tell it to get us a spare table
1832 * XXX
1833 * the entries in the queues should be tagged with the raidPtr
1834 * so that in the extremely rare case that two recons happen at once,
1835 * we know for which device were requesting a spare table
1836 * XXX
1837 *
1838 * XXX This code is not currently used. GO
1839 */
/*
 * Hand the request `req' to the user-space sparetable daemon via the
 * wait queue, then sleep until a response appears on the response
 * queue.  Returns the "fcol" status from the daemon's response.
 * Ownership of `req' passes to the consumer of the wait queue; the
 * response entry is freed here.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	/* Wake the daemon blocked in RAIDFRAME_SPARET_WAIT. */
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* `req' now points at the response entry, not our request. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1863 #endif
1864
1865 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1866 * bp & passes it down.
1867 * any calls originating in the kernel must use non-blocking I/O
1868 * do some extra sanity checking to return "appropriate" error values for
1869 * certain conditions (to make some standard utilities work)
1870 *
1871 * Formerly known as: rf_DoAccessKernel
1872 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* The mutex is dropped around the label update because
		 * rf_update_component_labels() does component I/O. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
		    RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Push any queued buffers down through dk(9). */
	dk_start(dksc, NULL);
}
1899
/*
 * Translate a struct buf into a RAIDframe access and submit it via
 * rf_DoAccess().  Returns EAGAIN when the array is out of openings
 * (caller should retry later), ENOSPC for out-of-range or unaligned
 * requests, or the rf_DoAccess() result.  Consumes one "opening" on
 * submission.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int do_async;
	int rc;

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		/* No room for another concurrent I/O right now. */
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		(int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	/* pb == 1 iff the byte count is not a whole number of sectors. */
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			(int) raid_addr, (int) sum, (int) num_blocks,
			(int) pb, (int) bp->b_resid));
	}
	/* Reject accesses past the end of the array; the extra `sum <'
	 * comparisons catch arithmetic wrap-around. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Partial-sector transfers are not supported. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume an opening; released again when the access completes. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
	    RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
	    do_async, raid_addr, num_blocks,
	    bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
1972
1973 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
1974
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */

int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately through the normal
		 * iodone path. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf for the component device; completion
		 * is routed back through KernelWakeupFunc(). */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
2048 /* this is the callback function associated with a I/O invoked from
2049 kernel code.
2050 */
/*
 * biodone callback for component I/O issued by rf_DispatchKernelIO().
 * Records the error (possibly failing the component), moves the
 * request to the raidPtr->iodone list and signals the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			    queue->raidPtr->raidid,
			    bp->b_error,
			    queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a component-label
			 * update the next time raidstart() runs. */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2117
2118
2119 /*
2120 * initialize a buf structure for doing an I/O in the kernel.
2121 */
2122 static void
2123 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2124 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2125 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2126 struct proc *b_proc)
2127 {
2128 /* bp->b_flags = B_PHYS | rw_flag; */
2129 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2130 bp->b_oflags = 0;
2131 bp->b_cflags = 0;
2132 bp->b_bcount = numSect << logBytesPerSector;
2133 bp->b_bufsize = bp->b_bcount;
2134 bp->b_error = 0;
2135 bp->b_dev = dev;
2136 bp->b_data = bf;
2137 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2138 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2139 if (bp->b_bcount == 0) {
2140 panic("bp->b_bcount is zero in InitBP!!");
2141 }
2142 bp->b_proc = b_proc;
2143 bp->b_iodone = cbFunc;
2144 bp->b_private = cbArg;
2145 }
2146
2147 /*
2148 * Wait interruptibly for an exclusive lock.
2149 *
2150 * XXX
2151 * Several drivers do this; it should be abstracted and made MP-safe.
2152 * (Hmm... where have we seen this warning before :-> GO )
2153 */
2154 static int
2155 raidlock(struct raid_softc *rs)
2156 {
2157 int error;
2158
2159 error = 0;
2160 mutex_enter(&rs->sc_mutex);
2161 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2162 rs->sc_flags |= RAIDF_WANTED;
2163 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2164 if (error != 0)
2165 goto done;
2166 }
2167 rs->sc_flags |= RAIDF_LOCKED;
2168 done:
2169 mutex_exit(&rs->sc_mutex);
2170 return (error);
2171 }
2172 /*
2173 * Unlock and wake up any waiters.
2174 */
2175 static void
2176 raidunlock(struct raid_softc *rs)
2177 {
2178
2179 mutex_enter(&rs->sc_mutex);
2180 rs->sc_flags &= ~RAIDF_LOCKED;
2181 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2182 rs->sc_flags &= ~RAIDF_WANTED;
2183 cv_broadcast(&rs->sc_cv);
2184 }
2185 mutex_exit(&rs->sc_mutex);
2186 }
2187
2188
2189 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2190 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2191 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2192
/*
 * Byte offset of the component info (label) area on a component.
 * Currently a fixed constant; kept as a function so the policy can
 * change in one place.
 */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
2199
2200 static daddr_t
2201 rf_component_info_size(unsigned secsize)
2202 {
2203 daddr_t info_size;
2204
2205 KASSERT(secsize);
2206 if (secsize > RF_COMPONENT_INFO_SIZE)
2207 info_size = secsize;
2208 else
2209 info_size = RF_COMPONENT_INFO_SIZE;
2210
2211 return info_size;
2212 }
2213
2214 static daddr_t
2215 rf_parity_map_offset(RF_Raid_t *raidPtr)
2216 {
2217 daddr_t map_offset;
2218
2219 KASSERT(raidPtr->bytesPerSector);
2220 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2221 map_offset = raidPtr->bytesPerSector;
2222 else
2223 map_offset = RF_COMPONENT_INFO_SIZE;
2224 map_offset += rf_component_info_offset();
2225
2226 return map_offset;
2227 }
2228
2229 static daddr_t
2230 rf_parity_map_size(RF_Raid_t *raidPtr)
2231 {
2232 daddr_t map_size;
2233
2234 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2235 map_size = raidPtr->bytesPerSector;
2236 else
2237 map_size = RF_PARITY_MAP_SIZE;
2238
2239 return map_size;
2240 }
2241
2242 int
2243 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2244 {
2245 RF_ComponentLabel_t *clabel;
2246
2247 clabel = raidget_component_label(raidPtr, col);
2248 clabel->clean = RF_RAID_CLEAN;
2249 raidflush_component_label(raidPtr, col);
2250 return(0);
2251 }
2252
2253
2254 int
2255 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2256 {
2257 RF_ComponentLabel_t *clabel;
2258
2259 clabel = raidget_component_label(raidPtr, col);
2260 clabel->clean = RF_RAID_DIRTY;
2261 raidflush_component_label(raidPtr, col);
2262 return(0);
2263 }
2264
/*
 * Read the on-disk component label of column `col' into its in-core
 * copy (raid_cinfo[col].ci_label).  Returns the error from
 * raidread_component_label(), 0 on success.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2274
/*
 * Return a pointer to the in-core component label for column `col'.
 * The caller may modify it and then push it out with
 * raidflush_component_label().
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2280
2281 int
2282 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2283 {
2284 RF_ComponentLabel_t *label;
2285
2286 label = &raidPtr->raid_cinfo[col].ci_label;
2287 label->mod_counter = raidPtr->mod_counter;
2288 #ifndef RF_NO_PARITY_MAP
2289 label->parity_map_modcount = label->mod_counter;
2290 #endif
2291 return raidwrite_component_label(raidPtr->bytesPerSector,
2292 raidPtr->Disks[col].dev,
2293 raidPtr->raid_cinfo[col].ci_vp, label);
2294 }
2295
2296
2297 static int
2298 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2299 RF_ComponentLabel_t *clabel)
2300 {
2301 return raidread_component_area(dev, b_vp, clabel,
2302 sizeof(RF_ComponentLabel_t),
2303 rf_component_info_offset(),
2304 rf_component_info_size(secsize));
2305 }
2306
2307 /* ARGSUSED */
2308 static int
2309 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2310 size_t msize, daddr_t offset, daddr_t dsize)
2311 {
2312 struct buf *bp;
2313 int error;
2314
2315 /* XXX should probably ensure that we don't try to do this if
2316 someone has changed rf_protected_sectors. */
2317
2318 if (b_vp == NULL) {
2319 /* For whatever reason, this component is not valid.
2320 Don't try to read a component label from it. */
2321 return(EINVAL);
2322 }
2323
2324 /* get a block of the appropriate size... */
2325 bp = geteblk((int)dsize);
2326 bp->b_dev = dev;
2327
2328 /* get our ducks in a row for the read */
2329 bp->b_blkno = offset / DEV_BSIZE;
2330 bp->b_bcount = dsize;
2331 bp->b_flags |= B_READ;
2332 bp->b_resid = dsize;
2333
2334 bdev_strategy(bp);
2335 error = biowait(bp);
2336
2337 if (!error) {
2338 memcpy(data, bp->b_data, msize);
2339 }
2340
2341 brelse(bp, 0);
2342 return(error);
2343 }
2344
2345
2346 static int
2347 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2348 RF_ComponentLabel_t *clabel)
2349 {
2350 return raidwrite_component_area(dev, b_vp, clabel,
2351 sizeof(RF_ComponentLabel_t),
2352 rf_component_info_offset(),
2353 rf_component_info_size(secsize), 0);
2354 }
2355
/* ARGSUSED */
/*
 * Write `msize' bytes from `data' at byte offset `offset' on `dev',
 * zero-padding the transfer out to `dsize' bytes.  If `asyncp' is
 * set, the write is issued B_ASYNC and this returns 0 immediately
 * without waiting for (or reporting) completion.
 *
 * NOTE(review): on the async path the buffer is not brelse()'d here;
 * presumably biodone() releases B_ASYNC buffers on completion --
 * confirm.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
	bp->b_resid = dsize;

	/* zero-fill first so the bytes past `msize' are deterministic */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
2390
2391 void
2392 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2393 {
2394 int c;
2395
2396 for (c = 0; c < raidPtr->numCol; c++) {
2397 /* Skip dead disks. */
2398 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2399 continue;
2400 /* XXXjld: what if an error occurs here? */
2401 raidwrite_component_area(raidPtr->Disks[c].dev,
2402 raidPtr->raid_cinfo[c].ci_vp, map,
2403 RF_PARITYMAP_NBYTE,
2404 rf_parity_map_offset(raidPtr),
2405 rf_parity_map_size(raidPtr), 0);
2406 }
2407 }
2408
2409 void
2410 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2411 {
2412 struct rf_paritymap_ondisk tmp;
2413 int c,first;
2414
2415 first=1;
2416 for (c = 0; c < raidPtr->numCol; c++) {
2417 /* Skip dead disks. */
2418 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2419 continue;
2420 raidread_component_area(raidPtr->Disks[c].dev,
2421 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2422 RF_PARITYMAP_NBYTE,
2423 rf_parity_map_offset(raidPtr),
2424 rf_parity_map_size(raidPtr));
2425 if (first) {
2426 memcpy(map, &tmp, sizeof(*map));
2427 first = 0;
2428 } else {
2429 rf_paritymap_merge(map, &tmp);
2430 }
2431 }
2432 }
2433
/*
 * Bump the array's modification counter and mark the component label
 * of every non-failed component (and every in-use spare) dirty.
 * Called so that an unclean shutdown can be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for.
			   NOTE(review): if no column references this
			   spare, scol keeps its value from a previous
			   iteration (or the initial -1) -- confirm a
			   used spare always has a matching spareCol. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2493
2494
/*
 * Rewrite the component labels of all optimal components and all
 * in-use spares, after bumping the modification counter so the fresh
 * labels supersede older copies.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, the labels are
 * additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for.
			   NOTE(review): if no column references this
			   spare, scol keeps its value from a previous
			   iteration (or the initial -1) -- confirm a
			   used spare always has a matching spareCol. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2572
2573 void
2574 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2575 {
2576
2577 if (vp != NULL) {
2578 if (auto_configured == 1) {
2579 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2580 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2581 vput(vp);
2582
2583 } else {
2584 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2585 }
2586 }
2587 }
2588
2589
2590 void
2591 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2592 {
2593 int r,c;
2594 struct vnode *vp;
2595 int acd;
2596
2597
2598 /* We take this opportunity to close the vnodes like we should.. */
2599
2600 for (c = 0; c < raidPtr->numCol; c++) {
2601 vp = raidPtr->raid_cinfo[c].ci_vp;
2602 acd = raidPtr->Disks[c].auto_configured;
2603 rf_close_component(raidPtr, vp, acd);
2604 raidPtr->raid_cinfo[c].ci_vp = NULL;
2605 raidPtr->Disks[c].auto_configured = 0;
2606 }
2607
2608 for (r = 0; r < raidPtr->numSpare; r++) {
2609 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2610 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2611 rf_close_component(raidPtr, vp, acd);
2612 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2613 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2614 }
2615 }
2616
2617
2618 void
2619 rf_ReconThread(struct rf_recon_req_internal *req)
2620 {
2621 int s;
2622 RF_Raid_t *raidPtr;
2623
2624 s = splbio();
2625 raidPtr = (RF_Raid_t *) req->raidPtr;
2626 raidPtr->recon_in_progress = 1;
2627
2628 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2629 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2630
2631 RF_Free(req, sizeof(*req));
2632
2633 raidPtr->recon_in_progress = 0;
2634 splx(s);
2635
2636 /* That's all... */
2637 kthread_exit(0); /* does not return */
2638 }
2639
2640 void
2641 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2642 {
2643 int retcode;
2644 int s;
2645
2646 raidPtr->parity_rewrite_stripes_done = 0;
2647 raidPtr->parity_rewrite_in_progress = 1;
2648 s = splbio();
2649 retcode = rf_RewriteParity(raidPtr);
2650 splx(s);
2651 if (retcode) {
2652 printf("raid%d: Error re-writing parity (%d)!\n",
2653 raidPtr->raidid, retcode);
2654 } else {
2655 /* set the clean bit! If we shutdown correctly,
2656 the clean bit on each component label will get
2657 set */
2658 raidPtr->parity_good = RF_RAID_CLEAN;
2659 }
2660 raidPtr->parity_rewrite_in_progress = 0;
2661
2662 /* Anyone waiting for us to stop? If so, inform them... */
2663 if (raidPtr->waitShutdown) {
2664 rf_lock_mutex2(raidPtr->rad_lock);
2665 cv_broadcast(&raidPtr->parity_rewrite_cv);
2666 rf_unlock_mutex2(raidPtr->rad_lock);
2667 }
2668
2669 /* That's all... */
2670 kthread_exit(0); /* does not return */
2671 }
2672
2673
2674 void
2675 rf_CopybackThread(RF_Raid_t *raidPtr)
2676 {
2677 int s;
2678
2679 raidPtr->copyback_in_progress = 1;
2680 s = splbio();
2681 rf_CopybackReconstructedData(raidPtr);
2682 splx(s);
2683 raidPtr->copyback_in_progress = 0;
2684
2685 /* That's all... */
2686 kthread_exit(0); /* does not return */
2687 }
2688
2689
/*
 * Kernel-thread body: reconstruct the component named in `req' in
 * place (i.e. back onto the same component), then exit the thread.
 * Frees `req'.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2707
/*
 * Try to read a component label from `dev'/`vp'; if it looks
 * reasonable, prepend a new RF_AutoConfig_t for it to `ac_list' and
 * return the extended list (keeping the vnode open).  Otherwise the
 * vnode is closed/released and the list is returned unchanged.  On
 * out-of-memory the entire list is freed and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* out of memory: tear down everything collected so far */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			    cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: free the label and close/release the vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2765
/*
 * Scan every disk-class device in the system for RAIDframe
 * components, returning a list of RF_AutoConfig_t entries (one per
 * component found, with its vnode left open).  The scan is made in
 * two passes -- wedges first, then everything else -- so a wedge that
 * covers a whole disk is preferred over that disk's raw partition.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* only wedges of type "raidframe" are candidates */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			/* check each FS_RAID partition of the disklabel */
			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
2969
2970
2971 int
2972 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
2973 {
2974
2975 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2976 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2977 ((clabel->clean == RF_RAID_CLEAN) ||
2978 (clabel->clean == RF_RAID_DIRTY)) &&
2979 clabel->row >=0 &&
2980 clabel->column >= 0 &&
2981 clabel->num_rows > 0 &&
2982 clabel->num_columns > 0 &&
2983 clabel->row < clabel->num_rows &&
2984 clabel->column < clabel->num_columns &&
2985 clabel->blockSize > 0 &&
2986 /*
2987 * numBlocksHi may contain garbage, but it is ok since
2988 * the type is unsigned. If it is really garbage,
2989 * rf_fix_old_label_size() will fix it.
2990 */
2991 rf_component_label_numblocks(clabel) > 0) {
2992 /*
2993 * label looks reasonable enough...
2994 * let's make sure it has no old garbage.
2995 */
2996 if (numsecs)
2997 rf_fix_old_label_size(clabel, numsecs);
2998 return(1);
2999 }
3000 return(0);
3001 }
3002
3003
3004 /*
3005 * For reasons yet unknown, some old component labels have garbage in
3006 * the newer numBlocksHi region, and this causes lossage. Since those
3007 * disks will also have numsecs set to less than 32 bits of sectors,
3008 * we can determine when this corruption has occurred, and fix it.
3009 *
3010 * The exact same problem, with the same unknown reason, happens to
3011 * the partitionSizeHi member as well.
3012 */
3013 static void
3014 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3015 {
3016
3017 if (numsecs < ((uint64_t)1 << 32)) {
3018 if (clabel->numBlocksHi) {
3019 printf("WARNING: total sectors < 32 bits, yet "
3020 "numBlocksHi set\n"
3021 "WARNING: resetting numBlocksHi to zero.\n");
3022 clabel->numBlocksHi = 0;
3023 }
3024
3025 if (clabel->partitionSizeHi) {
3026 printf("WARNING: total sectors < 32 bits, yet "
3027 "partitionSizeHi set\n"
3028 "WARNING: resetting partitionSizeHi to zero.\n");
3029 clabel->partitionSizeHi = 0;
3030 }
3031 }
3032 }
3033
3034
#ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* indexed by root_partition & 3 below */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3068
3069 RF_ConfigSet_t *
3070 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3071 {
3072 RF_AutoConfig_t *ac;
3073 RF_ConfigSet_t *config_sets;
3074 RF_ConfigSet_t *cset;
3075 RF_AutoConfig_t *ac_next;
3076
3077
3078 config_sets = NULL;
3079
3080 /* Go through the AutoConfig list, and figure out which components
3081 belong to what sets. */
3082 ac = ac_list;
3083 while(ac!=NULL) {
3084 /* we're going to putz with ac->next, so save it here
3085 for use at the end of the loop */
3086 ac_next = ac->next;
3087
3088 if (config_sets == NULL) {
3089 /* will need at least this one... */
3090 config_sets = (RF_ConfigSet_t *)
3091 malloc(sizeof(RF_ConfigSet_t),
3092 M_RAIDFRAME, M_NOWAIT);
3093 if (config_sets == NULL) {
3094 panic("rf_create_auto_sets: No memory!");
3095 }
3096 /* this one is easy :) */
3097 config_sets->ac = ac;
3098 config_sets->next = NULL;
3099 config_sets->rootable = 0;
3100 ac->next = NULL;
3101 } else {
3102 /* which set does this component fit into? */
3103 cset = config_sets;
3104 while(cset!=NULL) {
3105 if (rf_does_it_fit(cset, ac)) {
3106 /* looks like it matches... */
3107 ac->next = cset->ac;
3108 cset->ac = ac;
3109 break;
3110 }
3111 cset = cset->next;
3112 }
3113 if (cset==NULL) {
3114 /* didn't find a match above... new set..*/
3115 cset = (RF_ConfigSet_t *)
3116 malloc(sizeof(RF_ConfigSet_t),
3117 M_RAIDFRAME, M_NOWAIT);
3118 if (cset == NULL) {
3119 panic("rf_create_auto_sets: No memory!");
3120 }
3121 cset->ac = ac;
3122 ac->next = NULL;
3123 cset->next = config_sets;
3124 cset->rootable = 0;
3125 config_sets = cset;
3126 }
3127 }
3128 ac = ac_next;
3129 }
3130
3131
3132 return(config_sets);
3133 }
3134
3135 static int
3136 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3137 {
3138 RF_ComponentLabel_t *clabel1, *clabel2;
3139
3140 /* If this one matches the *first* one in the set, that's good
3141 enough, since the other members of the set would have been
3142 through here too... */
3143 /* note that we are not checking partitionSize here..
3144
3145 Note that we are also not checking the mod_counters here.
3146 If everything else matches except the mod_counter, that's
3147 good enough for this test. We will deal with the mod_counters
3148 a little later in the autoconfiguration process.
3149
3150 (clabel1->mod_counter == clabel2->mod_counter) &&
3151
3152 The reason we don't check for this is that failed disks
3153 will have lower modification counts. If those disks are
3154 not added to the set they used to belong to, then they will
3155 form their own set, which may result in 2 different sets,
3156 for example, competing to be configured at raid0, and
3157 perhaps competing to be the root filesystem set. If the
3158 wrong ones get configured, or both attempt to become /,
3159 weird behaviour and or serious lossage will occur. Thus we
3160 need to bring them into the fold here, and kick them out at
3161 a later point.
3162
3163 */
3164
3165 clabel1 = cset->ac->clabel;
3166 clabel2 = ac->clabel;
3167 if ((clabel1->version == clabel2->version) &&
3168 (clabel1->serial_number == clabel2->serial_number) &&
3169 (clabel1->num_rows == clabel2->num_rows) &&
3170 (clabel1->num_columns == clabel2->num_columns) &&
3171 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3172 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3173 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3174 (clabel1->parityConfig == clabel2->parityConfig) &&
3175 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3176 (clabel1->blockSize == clabel2->blockSize) &&
3177 rf_component_label_numblocks(clabel1) ==
3178 rf_component_label_numblocks(clabel2) &&
3179 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3180 (clabel1->root_partition == clabel2->root_partition) &&
3181 (clabel1->last_unit == clabel2->last_unit) &&
3182 (clabel1->config_order == clabel2->config_order)) {
3183 /* if it get's here, it almost *has* to be a match */
3184 } else {
3185 /* it's not consistent with somebody in the set..
3186 punt */
3187 return(0);
3188 }
3189 /* all was fine.. it must fit... */
3190 return(1);
3191 }
3192
/*
 * Decide whether a configuration set has enough live components to be
 * configured.  The set's expected mod_counter is the maximum over its
 * members; only components carrying that counter are counted as
 * present.  RAID 1 is handled specially: components are paired
 * (even, odd) and only the loss of BOTH members of a pair is fatal.
 * For the other levels a simple missing-count threshold is applied.
 * Returns 1 if configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each column, look for a member with the right column
	   number AND the up-to-date mod_counter */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate
	   at most one */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3295
3296 void
3297 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3298 RF_Raid_t *raidPtr)
3299 {
3300 RF_ComponentLabel_t *clabel;
3301 int i;
3302
3303 clabel = ac->clabel;
3304
3305 /* 1. Fill in the common stuff */
3306 config->numCol = clabel->num_columns;
3307 config->numSpare = 0; /* XXX should this be set here? */
3308 config->sectPerSU = clabel->sectPerSU;
3309 config->SUsPerPU = clabel->SUsPerPU;
3310 config->SUsPerRU = clabel->SUsPerRU;
3311 config->parityConfig = clabel->parityConfig;
3312 /* XXX... */
3313 strcpy(config->diskQueueType,"fifo");
3314 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3315 config->layoutSpecificSize = 0; /* XXX ?? */
3316
3317 while(ac!=NULL) {
3318 /* row/col values will be in range due to the checks
3319 in reasonable_label() */
3320 strcpy(config->devnames[0][ac->clabel->column],
3321 ac->devname);
3322 ac = ac->next;
3323 }
3324
3325 for(i=0;i<RF_MAXDBGV;i++) {
3326 config->debugVars[i][0] = 0;
3327 }
3328 }
3329
3330 int
3331 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3332 {
3333 RF_ComponentLabel_t *clabel;
3334 int column;
3335 int sparecol;
3336
3337 raidPtr->autoconfigure = new_value;
3338
3339 for(column=0; column<raidPtr->numCol; column++) {
3340 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3341 clabel = raidget_component_label(raidPtr, column);
3342 clabel->autoconfigure = new_value;
3343 raidflush_component_label(raidPtr, column);
3344 }
3345 }
3346 for(column = 0; column < raidPtr->numSpare ; column++) {
3347 sparecol = raidPtr->numCol + column;
3348 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3349 clabel = raidget_component_label(raidPtr, sparecol);
3350 clabel->autoconfigure = new_value;
3351 raidflush_component_label(raidPtr, sparecol);
3352 }
3353 }
3354 return(new_value);
3355 }
3356
3357 int
3358 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3359 {
3360 RF_ComponentLabel_t *clabel;
3361 int column;
3362 int sparecol;
3363
3364 raidPtr->root_partition = new_value;
3365 for(column=0; column<raidPtr->numCol; column++) {
3366 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3367 clabel = raidget_component_label(raidPtr, column);
3368 clabel->root_partition = new_value;
3369 raidflush_component_label(raidPtr, column);
3370 }
3371 }
3372 for(column = 0; column < raidPtr->numSpare ; column++) {
3373 sparecol = raidPtr->numCol + column;
3374 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3375 clabel = raidget_component_label(raidPtr, sparecol);
3376 clabel->root_partition = new_value;
3377 raidflush_component_label(raidPtr, sparecol);
3378 }
3379 }
3380 return(new_value);
3381 }
3382
3383 void
3384 rf_release_all_vps(RF_ConfigSet_t *cset)
3385 {
3386 RF_AutoConfig_t *ac;
3387
3388 ac = cset->ac;
3389 while(ac!=NULL) {
3390 /* Close the vp, and give it back */
3391 if (ac->vp) {
3392 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3393 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3394 vput(ac->vp);
3395 ac->vp = NULL;
3396 }
3397 ac = ac->next;
3398 }
3399 }
3400
3401
3402 void
3403 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3404 {
3405 RF_AutoConfig_t *ac;
3406 RF_AutoConfig_t *next_ac;
3407
3408 ac = cset->ac;
3409 while(ac!=NULL) {
3410 next_ac = ac->next;
3411 /* nuke the label */
3412 free(ac->clabel, M_RAIDFRAME);
3413 /* cleanup the config structure */
3414 free(ac, M_RAIDFRAME);
3415 /* "next.." */
3416 ac = next_ac;
3417 }
3418 /* and, finally, nuke the config set */
3419 free(cset, M_RAIDFRAME);
3420 }
3421
3422
/*
 * Initialize *clabel from the current state of raidPtr so it can be
 * written out as an on-disk component label.  Only the set-wide fields
 * are filled in here; per-component fields are set by the callers.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* Sets are always recorded as a single row of numCol columns. */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* Layout geometry, copied from the in-core layout. */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	/* Remember which unit we were, so autoconfig can prefer it. */
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3455
/*
 * Configure one autoconfigured RAID set described by cset.  Allocates a
 * config structure, picks a unit number (preferring the label's
 * last_unit), runs rf_Configure(), and notes root eligibility.
 * Returns the attached softc, or NULL on allocation/config failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk upward from last_unit until we find an unconfigured unit. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* No existing softc at that unit: create one now. */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed: release the softc we claimed. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3539
/*
 * Initialize pool p for objects of the given size, pre-allocate xmin
 * items, and cap the free-item count at xmax.  Panics if the initial
 * pool_prime() allocation fails, since RAIDframe cannot run without
 * its pools.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{
	int error;

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	if ((error = pool_prime(p, xmin)) != 0)
		panic("%s: failed to prime pool: %d", __func__, error);
	pool_setlowat(p, xmin);
}
3552
3553 /*
3554 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3555 * to see if there is IO pending and if that IO could possibly be done
3556 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3557 * otherwise.
3558 *
3559 */
3560 int
3561 rf_buf_queue_check(RF_Raid_t *raidPtr)
3562 {
3563 struct raid_softc *rs;
3564 struct dk_softc *dksc;
3565
3566 rs = raidPtr->softc;
3567 dksc = &rs->sc_dksc;
3568
3569 if ((rs->sc_flags & RAIDF_INITED) == 0)
3570 return 1;
3571
3572 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3573 /* there is work to do */
3574 return 0;
3575 }
3576 /* default is nothing to do */
3577 return 1;
3578 }
3579
3580 int
3581 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3582 {
3583 uint64_t numsecs;
3584 unsigned secsize;
3585 int error;
3586
3587 error = getdisksize(vp, &numsecs, &secsize);
3588 if (error == 0) {
3589 diskPtr->blockSize = secsize;
3590 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3591 diskPtr->partitionSize = numsecs;
3592 return 0;
3593 }
3594 return error;
3595 }
3596
/* Autoconf match function: raid is a pseudo-device, always matches. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3602
/*
 * Autoconf attach function: nothing to do here; the device is set up
 * when the RAID set itself is configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3607
3608
3609 static int
3610 raid_detach(device_t self, int flags)
3611 {
3612 int error;
3613 struct raid_softc *rs = raidsoftc(self);
3614
3615 if (rs == NULL)
3616 return ENXIO;
3617
3618 if ((error = raidlock(rs)) != 0)
3619 return (error);
3620
3621 error = raid_detach_unlocked(rs);
3622
3623 raidunlock(rs);
3624
3625 /* XXX raid can be referenced here */
3626
3627 if (error)
3628 return error;
3629
3630 /* Free the softc */
3631 raidput(rs);
3632
3633 return 0;
3634 }
3635
/*
 * Fill in the dk(4) disk geometry from the RAID set's parameters and
 * push it to the disk subsystem.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* NOTE(review): fabricated geometry (4 tracks per column);
	   rationale not evident here -- confirm before relying on it. */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
3651
3652 /*
3653 * Get cache info for all the components (including spares).
3654 * Returns intersection of all the cache flags of all disks, or first
3655 * error if any encountered.
3656 * XXXfua feature flags can change as spares are added - lock down somehow
3657 */
3658 static int
3659 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3660 {
3661 int c;
3662 int error;
3663 int dkwhole = 0, dkpart;
3664
3665 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3666 /*
3667 * Check any non-dead disk, even when currently being
3668 * reconstructed.
3669 */
3670 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
3671 || raidPtr->Disks[c].status == rf_ds_reconstructing) {
3672 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3673 DIOCGCACHE, &dkpart, FREAD, NOCRED);
3674 if (error) {
3675 if (error != ENODEV) {
3676 printf("raid%d: get cache for component %s failed\n",
3677 raidPtr->raidid,
3678 raidPtr->Disks[c].devname);
3679 }
3680
3681 return error;
3682 }
3683
3684 if (c == 0)
3685 dkwhole = dkpart;
3686 else
3687 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3688 }
3689 }
3690
3691 *data = dkwhole;
3692
3693 return 0;
3694 }
3695
3696 /*
3697 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3698 * We end up returning whatever error was returned by the first cache flush
3699 * that fails.
3700 */
3701
3702 int
3703 rf_sync_component_caches(RF_Raid_t *raidPtr)
3704 {
3705 int c, sparecol;
3706 int e,error;
3707 int force = 1;
3708
3709 error = 0;
3710 for (c = 0; c < raidPtr->numCol; c++) {
3711 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3712 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3713 &force, FWRITE, NOCRED);
3714 if (e) {
3715 if (e != ENODEV)
3716 printf("raid%d: cache flush to component %s failed.\n",
3717 raidPtr->raidid, raidPtr->Disks[c].devname);
3718 if (error == 0) {
3719 error = e;
3720 }
3721 }
3722 }
3723 }
3724
3725 for( c = 0; c < raidPtr->numSpare ; c++) {
3726 sparecol = raidPtr->numCol + c;
3727 /* Need to ensure that the reconstruct actually completed! */
3728 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3729 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3730 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3731 if (e) {
3732 if (e != ENODEV)
3733 printf("raid%d: cache flush to component %s failed.\n",
3734 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3735 if (error == 0) {
3736 error = e;
3737 }
3738 }
3739 }
3740 }
3741 return error;
3742 }
3743
3744 /* Fill in info with the current status */
3745 void
3746 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3747 {
3748
3749 if (raidPtr->status != rf_rs_reconstructing) {
3750 info->total = 100;
3751 info->completed = 100;
3752 } else {
3753 info->total = raidPtr->reconControl->numRUsTotal;
3754 info->completed = raidPtr->reconControl->numRUsComplete;
3755 }
3756 info->remaining = info->total - info->completed;
3757 }
3758
3759 /* Fill in info with the current status */
3760 void
3761 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3762 {
3763
3764 if (raidPtr->parity_rewrite_in_progress == 1) {
3765 info->total = raidPtr->Layout.numStripe;
3766 info->completed = raidPtr->parity_rewrite_stripes_done;
3767 } else {
3768 info->completed = 100;
3769 info->total = 100;
3770 }
3771 info->remaining = info->total - info->completed;
3772 }
3773
3774 /* Fill in info with the current status */
3775 void
3776 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3777 {
3778
3779 if (raidPtr->copyback_in_progress == 1) {
3780 info->total = raidPtr->Layout.numStripe;
3781 info->completed = raidPtr->copyback_stripes_done;
3782 info->remaining = info->total - info->completed;
3783 } else {
3784 info->remaining = 0;
3785 info->completed = 100;
3786 info->total = 100;
3787 }
3788 }
3789
3790 /* Fill in config with the current info */
3791 int
3792 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3793 {
3794 int d, i, j;
3795
3796 if (!raidPtr->valid)
3797 return (ENODEV);
3798 config->cols = raidPtr->numCol;
3799 config->ndevs = raidPtr->numCol;
3800 if (config->ndevs >= RF_MAX_DISKS)
3801 return (ENOMEM);
3802 config->nspares = raidPtr->numSpare;
3803 if (config->nspares >= RF_MAX_DISKS)
3804 return (ENOMEM);
3805 config->maxqdepth = raidPtr->maxQueueDepth;
3806 d = 0;
3807 for (j = 0; j < config->cols; j++) {
3808 config->devs[d] = raidPtr->Disks[j];
3809 d++;
3810 }
3811 for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
3812 config->spares[i] = raidPtr->Disks[j];
3813 if (config->spares[i].status == rf_ds_rebuilding_spare) {
3814 /* XXX: raidctl(8) expects to see this as a used spare */
3815 config->spares[i].status = rf_ds_used_spare;
3816 }
3817 }
3818 return 0;
3819 }
3820
3821 int
3822 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
3823 {
3824 RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
3825 RF_ComponentLabel_t *raid_clabel;
3826 int column = clabel->column;
3827
3828 if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
3829 return EINVAL;
3830 raid_clabel = raidget_component_label(raidPtr, column);
3831 memcpy(clabel, raid_clabel, sizeof *clabel);
3832
3833 return 0;
3834 }
3835
3836 /*
3837 * Module interface
3838 */
3839
3840 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
3841
3842 #ifdef _MODULE
3843 CFDRIVER_DECL(raid, DV_DISK, NULL);
3844 #endif
3845
3846 static int raid_modcmd(modcmd_t, void *);
3847 static int raid_modcmd_init(void);
3848 static int raid_modcmd_fini(void);
3849
3850 static int
3851 raid_modcmd(modcmd_t cmd, void *data)
3852 {
3853 int error;
3854
3855 error = 0;
3856 switch (cmd) {
3857 case MODULE_CMD_INIT:
3858 error = raid_modcmd_init();
3859 break;
3860 case MODULE_CMD_FINI:
3861 error = raid_modcmd_fini();
3862 break;
3863 default:
3864 error = ENOTTY;
3865 break;
3866 }
3867 return error;
3868 }
3869
/*
 * Module init: set up locking, attach the block/char devsw and the
 * autoconf driver/attach glue (unwinding on failure), boot the
 * RAIDframe core, and register a config finalizer so autoconfig runs
 * after all real hardware has been found.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the majors; EEXIST means built-in. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Unwind the devsw attach done above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Unwind the cfdriver and devsw attaches done above. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: the module still works without autoconfig. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
3940
/*
 * Module fini: refuse to unload while any raid device exists, then tear
 * down the autoconf glue and devsw in reverse order of attachment,
 * re-attaching what was already detached if a later step fails.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: re-attach what we already detached. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back: re-attach what we already detached. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core and free the global state. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
3990