/*	$NetBSD: rf_netbsdkintf.c,v 1.382 2020/04/13 00:27:17 chs Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Greg Oster; Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *
 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
 */

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Mark Holland, Jim Zelenka
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/***********************************************************
 *
 * rf_kintf.c -- the kernel interface routines for RAIDframe
 *
 ***********************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.382 2020/04/13 00:27:17 chs Exp $");

#ifdef _KERNEL_OPT
#include "opt_raid_autoconfig.h"
#include "opt_compat_netbsd32.h"
#endif

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/disk.h>
#include <sys/device.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/reboot.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/compat_stub.h>

#include <prop/proplib.h>

#include <dev/raidframe/raidframevar.h>
#include <dev/raidframe/raidframeio.h>
#include <dev/raidframe/rf_paritymap.h>

#include "rf_raid.h"
#include "rf_copyback.h"
#include "rf_dag.h"
#include "rf_dagflags.h"
#include "rf_desc.h"
#include "rf_diskqueue.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_kintf.h"
#include "rf_options.h"
#include "rf_driver.h"
#include "rf_parityscan.h"
#include "rf_threadstuff.h"

#include "ioconf.h"

#ifdef DEBUG
int rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else				/* DEBUG */
#define db1_printf(a) { }
#endif				/* DEBUG */

#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */
#endif

MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");

/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int, struct proc *);
static void raidinit(struct raid_softc *);
static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);

static int raid_match(device_t, cfdata_t, void *);
static void raid_attach(device_t, device_t, void *);
static int raid_detach(device_t, int);

static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

static int raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);

static int raid_diskstart(device_t, struct buf *bp);
static int raid_dumpblocks(device_t, void *, daddr_t, int);
static int raid_lastclose(device_t);

static dev_type_open(raidopen);
static dev_type_close(raidclose);
static dev_type_read(raidread);
static dev_type_write(raidwrite);
static dev_type_ioctl(raidioctl);
static dev_type_strategy(raidstrategy);
static dev_type_dump(raiddump);
static dev_type_size(raidsize);

const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};

#define	raidunit(x)	DISKUNIT(x)
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)

extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;
	RF_ReconReqFlags_t flags;
	void *raidPtr;
};

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by, say, 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif
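
/*
 * Illustrative only: because RAIDOUTSTANDING is guarded by #ifndef above,
 * a build can override it, e.g. with a kernel config line such as
 *
 *	options 	RAIDOUTSTANDING=10
 *
 * (assuming the option is plumbed through to this file).  With the 5-disk
 * example above, 10 outstanding writes could then pin roughly 10 * 128K
 * plus the 640K of incoming data -- about 1920K of kernel memory.
 */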

#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

void rf_ReconThread(struct rf_recon_req_internal *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
int rf_autoconfig(device_t);
void rf_buildroothack(RF_ConfigSet_t *);

RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *, RF_AutoConfig_t *);
int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
void rf_create_configuration(RF_AutoConfig_t *, RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
static bool raidautoconfigdone = false;

struct RF_Pools_s rf_pools;

static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;

static struct raid_softc *
raidcreate(int unit) {
	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
	sc->sc_unit = unit;
	cv_init(&sc->sc_cv, "raidunit");
	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
	return sc;
}

static void
raiddestroy(struct raid_softc *sc) {
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mutex);
	kmem_free(sc, sizeof(*sc));
}

static struct raid_softc *
raidget(int unit, bool create) {
	struct raid_softc *sc;
	if (unit < 0) {
#ifdef DIAGNOSTIC
		panic("%s: unit %d!", __func__, unit);
#endif
		return NULL;
	}
	mutex_enter(&raid_lock);
	LIST_FOREACH(sc, &raids, sc_link) {
		if (sc->sc_unit == unit) {
			mutex_exit(&raid_lock);
			return sc;
		}
	}
	mutex_exit(&raid_lock);
	if (!create)
		return NULL;
	sc = raidcreate(unit);
	mutex_enter(&raid_lock);
	LIST_INSERT_HEAD(&raids, sc, sc_link);
	mutex_exit(&raid_lock);
	return sc;
}

static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
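
/*
 * Illustrative usage sketch: this is the lookup pattern the dev_type_*
 * entry points below follow.  Only raidopen() passes create = true, so a
 * unit springs into existence on first open; every other entry point
 * expects the unit to exist already:
 *
 *	struct raid_softc *rs;
 *
 *	if ((rs = raidget(raidunit(dev), false)) == NULL)
 *		return ENXIO;
 */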

void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}

int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}

int
rf_inited(const struct raid_softc *rs) {
	return (rs->sc_flags & RAIDF_INITED) != 0;
}

RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}

int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}

static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
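
/*
 * Example of the matching above: a component named "/dev/wd0a" is
 * trimmed to "wd0a", and the strncmp() over strlen("wd0") matches a
 * booted_device named "wd0", so the component's partition letter is
 * effectively ignored; dk wedges are first mapped to their parent.
 */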

void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition.  This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition.  We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used).  For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
		    "contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}

static int
raidsize(dev_t dev)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;
	unsigned int unit;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return -1;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return -1;

	return dk_size(dksc, dev);
}

static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

	/*
	   Note that blkno is relative to this particular partition.
	   By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
	 */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}
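
/*
 * Example: a dump to block 0 of the raid partition therefore lands at
 * block RF_PROTECTED_SECTORS of the component, keeping the reserved
 * label area at the front of each component intact.
 */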

static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}

/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int error = 0;
	int part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}

static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/* Last one... device is not unconfigured yet.
	   If RAIDF_INITED is not set, device shutdown has already
	   taken care of setting the clean bits; otherwise mark
	   things as clean here... */

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}

/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}

static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}

static void
raidstrategy(struct buf *bp)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Raid_t *raidPtr;

	unit = raidunit(bp->b_dev);
	if ((rs = raidget(unit, false)) == NULL) {
		bp->b_error = ENXIO;
		goto fail;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto fail;
	}
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	/* Queue IO only */
	if (dk_strategy_defer(dksc, bp))
		goto done;

	/* schedule the IO to happen at the next convenient time */
	raid_wakeup(raidPtr);

done:
	return;

fail:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}

static int
raid_diskstart(device_t dev, struct buf *bp)
{
	struct raid_softc *rs = raidsoftc(dev);
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		db1_printf(("raid is not valid..\n"));
		return ENODEV;
	}

	/* XXX */
	bp->b_resid = 0;

	return raiddoaccess(raidPtr, bp);
}

void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}

/* ARGSUSED */
static int
raidread(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (ENXIO);

	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));

}

/* ARGSUSED */
static int
raidwrite(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (ENXIO);

	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));

}

static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}

static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}

int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}

static int
rf_copyinspecificbuf(RF_Config_t *k_cfg)
{
	/* allocate a buffer for the layout-specific data, and copy it in */
	if (k_cfg->layoutSpecificSize == 0)
		return 0;

	if (k_cfg->layoutSpecificSize > 10000) {
		/* sanity check */
		return EINVAL;
	}

	u_char *specific_buf;
	specific_buf = RF_Malloc(k_cfg->layoutSpecificSize);
	if (specific_buf == NULL)
		return ENOMEM;

	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
	    k_cfg->layoutSpecificSize);
	if (retcode) {
		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
		return retcode;
	}

	k_cfg->layoutSpecific = specific_buf;
	return 0;
}

static int
rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
{
	RF_Config_t *u_cfg = *((RF_Config_t **) data);

	if (rs->sc_r.valid) {
		/* There is a valid RAID set running on this unit! */
		printf("raid%d: Device already configured!\n", rs->sc_unit);
		return EINVAL;
	}

	/* copy-in the configuration information */
	/* data points to a pointer to the configuration structure */
	*k_cfg = RF_Malloc(sizeof(**k_cfg));
	if (*k_cfg == NULL) {
		return ENOMEM;
	}
	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
	if (retcode == 0)
		return 0;
	RF_Free(*k_cfg, sizeof(RF_Config_t));
	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
	rs->sc_flags |= RAIDF_SHUTDOWN;
	return retcode;
}
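
/*
 * Userland sketch (illustrative; not raidctl(8) itself): the ioctl
 * argument is a pointer to a pointer to the configuration, which is
 * why rf_getConfiguration() dereferences "data" before the copyin:
 *
 *	RF_Config_t cfg, *cfgp = &cfg;
 *
 *	memset(&cfg, 0, sizeof(cfg));
 *	... fill in columns, component names, layout ...
 *	if (ioctl(fd, RAIDFRAME_CONFIGURE, &cfgp) == -1)
 *		err(1, "RAIDFRAME_CONFIGURE");
 */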

int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 * there is no stale data left in the case of a
	 * reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}

#if RF_DISABLED
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif /* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif

static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above.  We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	 */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0;	/* we don't pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}

static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d: Col: %d Too many failures.\n",
		    raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d: Col: %d "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}

static int
rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
{
	/*
	 * This makes no sense on a RAID 0, or if we are not reconstructing
	 * so tell the user it's done.
	 */
	if (raidPtr->Layout.map->faultsTolerated == 0 ||
	    raidPtr->status != rf_rs_reconstructing) {
		*data = 100;
		return 0;
	}
	if (raidPtr->reconControl->numRUsTotal == 0) {
		*data = 0;
		return 0;
	}
	*data = (raidPtr->reconControl->numRUsComplete * 100
	    / raidPtr->reconControl->numRUsTotal);
	return 0;
}

static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int unit = raidunit(dev);
	int part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg;
	RF_Raid_t *raidPtr;
	RF_AccTotals_t *totals;
	RF_SingleComponent_t component;
	RF_DeviceConfig_t *d_cfg, *ucfgp;
	int retcode = 0;
	int column;
	RF_ComponentLabel_t *clabel;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
	    (int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	if (rf_must_be_initialized(rs, cmd))
		return ENXIO;

	switch (cmd) {
		/* configure the system */
	case RAIDFRAME_CONFIGURE:
		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
			return retcode;
		return rf_construct(rs, k_cfg);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((retcode = raidlock(rs)) != 0)
			return retcode;

		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return retcode;
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if RF_DISABLED
	case RAIDFRAME_SET_COMPONENT_LABEL:
		return rf_set_component_label(raidPtr, data);
#endif

	case RAIDFRAME_INIT_LABELS:
		return rf_init_component_label(raidPtr, data);

	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return 0;
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
		    rf_RewriteParityThread, raidPtr,"raid_parity");

	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
		return rf_add_hot_spare(raidPtr, &component);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		return retcode;

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_delete_component(raidPtr, &component);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_incorporate_hot_spare(raidPtr, &component);

	case RAIDFRAME_REBUILD_IN_PLACE:
		return rf_rebuild_in_place(raidPtr, data);

	case RAIDFRAME_GET_INFO:
		ucfgp = *(RF_DeviceConfig_t **)data;
		d_cfg = RF_Malloc(sizeof(*d_cfg));
		if (d_cfg == NULL)
			return ENOMEM;
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
		return retcode;

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return 0;

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map, data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return 0;

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return 0;

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return 0;

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return 0;

	case RAIDFRAME_FAIL_DISK:
		return rf_fail_disk(raidPtr, data);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return EINVAL;
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->copyback_thread,
		    rf_CopybackThread, raidPtr, "raid_copyback");

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		return rf_check_recon_status(raidPtr, data);

	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
			    raidPtr->parity_rewrite_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table -- XXX */
#if RF_DISABLED
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return 0;

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		waitreq = RF_Malloc(sizeof(*waitreq));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return 0;

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor. the return status of the spare
		 * table installation is passed in the "fcol" field */
		waitreq = RF_Malloc(sizeof(*waitreq));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return retcode;
#endif
	default:
		/*
		 * Don't bother trying to load compat modules
		 * if it is not our ioctl. This is more efficient
		 * and makes rump tests not depend on compat code
		 */
		if (IOCGROUP(cmd) != 'r')
			break;
#ifdef _LP64
		if ((l->l_proc->p_flag & PK_32) != 0) {
			module_autoload("compat_netbsd32_raid",
			    MODULE_CLASS_EXEC);
			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
			    (rs, cmd, data), enosys(), retcode);
			if (retcode != EPASSTHROUGH)
				return retcode;
		}
#endif
		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;

		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}


/* raidinit -- complete the rest of the initialization for the
   RAIDframe device. */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe. */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
1834 * XXX
1835 *
1836 * XXX This code is not currently used. GO
1837 */
1838 int
1839 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1840 {
1841 int retcode;
1842
1843 rf_lock_mutex2(rf_sparet_wait_mutex);
1844 req->next = rf_sparet_wait_queue;
1845 rf_sparet_wait_queue = req;
1846 rf_broadcast_cond2(rf_sparet_wait_cv);
1847
1848 /* mpsleep unlocks the mutex */
1849 while (!rf_sparet_resp_queue) {
1850 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
1851 }
1852 req = rf_sparet_resp_queue;
1853 rf_sparet_resp_queue = req->next;
1854 rf_unlock_mutex2(rf_sparet_wait_mutex);
1855
1856 retcode = req->fcol;
1857 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1858 * alloc'd */
1859 return (retcode);
1860 }
1861 #endif
1862
1863 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1864 * bp & passes it down.
1865 * any calls originating in the kernel must use non-blocking I/O
1866 * do some extra sanity checking to return "appropriate" error values for
1867 * certain conditions (to make some standard utilities work)
1868 *
1869 * Formerly known as: rf_DoAccessKernel
1870 */
1871 void
1872 raidstart(RF_Raid_t *raidPtr)
1873 {
1874 struct raid_softc *rs;
1875 struct dk_softc *dksc;
1876
1877 rs = raidPtr->softc;
1878 dksc = &rs->sc_dksc;
1879 /* quick check to see if anything has died recently */
1880 rf_lock_mutex2(raidPtr->mutex);
1881 if (raidPtr->numNewFailures > 0) {
1882 rf_unlock_mutex2(raidPtr->mutex);
1883 rf_update_component_labels(raidPtr,
1884 RF_NORMAL_COMPONENT_UPDATE);
1885 rf_lock_mutex2(raidPtr->mutex);
1886 raidPtr->numNewFailures--;
1887 }
1888 rf_unlock_mutex2(raidPtr->mutex);
1889
1890 if ((rs->sc_flags & RAIDF_INITED) == 0) {
1891 printf("raid%d: raidstart not ready\n", raidPtr->raidid);
1892 return;
1893 }
1894
1895 dk_start(dksc, NULL);
1896 }
1897
1898 static int
1899 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
1900 {
1901 RF_SectorCount_t num_blocks, pb, sum;
1902 RF_RaidAddr_t raid_addr;
1903 daddr_t blocknum;
1904 int do_async;
1905 int rc;
1906
1907 rf_lock_mutex2(raidPtr->mutex);
1908 if (raidPtr->openings == 0) {
1909 rf_unlock_mutex2(raidPtr->mutex);
1910 return EAGAIN;
1911 }
1912 rf_unlock_mutex2(raidPtr->mutex);
1913
1914 blocknum = bp->b_rawblkno;
1915
1916 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1917 (int) blocknum));
1918
1919 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1920 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1921
1922 /* *THIS* is where we adjust what block we're going to...
1923 * but DO NOT TOUCH bp->b_blkno!!! */
1924 raid_addr = blocknum;
1925
1926 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1927 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1928 sum = raid_addr + num_blocks + pb;
	if (rf_debugKernelAccess) {
1930 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1931 (int) raid_addr, (int) sum, (int) num_blocks,
1932 (int) pb, (int) bp->b_resid));
1933 }
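	/*
	 * Reject accesses that run off the end of the set.  The
	 * "sum < ..." comparisons catch unsigned wraparound: if the
	 * addition above overflowed, sum is smaller than at least one
	 * of its addends even though it might still pass the
	 * totalSectors test.
	 */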
1934 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1935 || (sum < num_blocks) || (sum < pb)) {
1936 rc = ENOSPC;
1937 goto done;
1938 }
1939 /*
1940 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1941 */
1942
1943 if (bp->b_bcount & raidPtr->sectorMask) {
1944 rc = ENOSPC;
1945 goto done;
1946 }
1947 db1_printf(("Calling DoAccess..\n"));
1948
1949
1950 rf_lock_mutex2(raidPtr->mutex);
1951 raidPtr->openings--;
1952 rf_unlock_mutex2(raidPtr->mutex);
1953
1954 /*
1955 * Everything is async.
1956 */
1957 do_async = 1;
1958
1959 /* don't ever condition on bp->b_flags & B_WRITE.
1960 * always condition on B_READ instead */
1961
1962 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
1963 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
1964 do_async, raid_addr, num_blocks,
1965 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
1966
1967 done:
1968 return rc;
1969 }
1970
1971 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
1972
1973 int
1974 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
1975 {
1976 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
1977 struct buf *bp;
1978
1979 req->queue = queue;
1980 bp = req->bp;
1981
1982 switch (req->type) {
1983 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
1984 /* XXX need to do something extra here.. */
1985 /* I'm leaving this in, as I've never actually seen it used,
1986 * and I'd like folks to report it... GO */
		printf("WAKEUP CALLED\n");
1988 queue->numOutstanding++;
1989
1990 bp->b_flags = 0;
1991 bp->b_private = req;
1992
1993 KernelWakeupFunc(bp);
1994 break;
1995
1996 case RF_IO_TYPE_READ:
1997 case RF_IO_TYPE_WRITE:
1998 #if RF_ACC_TRACE > 0
1999 if (req->tracerec) {
2000 RF_ETIMER_START(req->tracerec->timer);
2001 }
2002 #endif
2003 InitBP(bp, queue->rf_cinfo->ci_vp,
2004 op, queue->rf_cinfo->ci_dev,
2005 req->sectorOffset, req->numSector,
2006 req->buf, KernelWakeupFunc, (void *) req,
2007 queue->raidPtr->logBytesPerSector, req->b_proc);
2008
2009 if (rf_debugKernelAccess) {
2010 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2011 (long) bp->b_blkno));
2012 }
2013 queue->numOutstanding++;
2014 queue->last_deq_sector = req->sectorOffset;
2015 /* acc wouldn't have been let in if there were any pending
2016 * reqs at any other priority */
2017 queue->curPriority = req->priority;
2018
2019 db1_printf(("Going for %c to unit %d col %d\n",
2020 req->type, queue->raidPtr->raidid,
2021 queue->col));
2022 db1_printf(("sector %d count %d (%d bytes) %d\n",
2023 (int) req->sectorOffset, (int) req->numSector,
2024 (int) (req->numSector <<
2025 queue->raidPtr->logBytesPerSector),
2026 (int) queue->raidPtr->logBytesPerSector));
2027
2028 /*
2029 * XXX: drop lock here since this can block at
2030 * least with backing SCSI devices. Retake it
2031 * to minimize fuss with calling interfaces.
2032 */
2033
2034 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2035 bdev_strategy(bp);
2036 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2037 break;
2038
2039 default:
2040 panic("bad req->type in rf_DispatchKernelIO");
2041 }
2042 db1_printf(("Exiting from DispatchKernelIO\n"));
2043
2044 return (0);
2045 }
/* This is the callback function associated with an I/O invoked from
 * kernel code.
 */
2049 static void
2050 KernelWakeupFunc(struct buf *bp)
2051 {
2052 RF_DiskQueueData_t *req = NULL;
2053 RF_DiskQueue_t *queue;
2054
2055 db1_printf(("recovering the request queue:\n"));
2056
2057 req = bp->b_private;
2058
2059 queue = (RF_DiskQueue_t *) req->queue;
2060
2061 rf_lock_mutex2(queue->raidPtr->iodone_lock);
2062
2063 #if RF_ACC_TRACE > 0
2064 if (req->tracerec) {
2065 RF_ETIMER_STOP(req->tracerec->timer);
2066 RF_ETIMER_EVAL(req->tracerec->timer);
2067 rf_lock_mutex2(rf_tracing_mutex);
2068 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2069 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2070 req->tracerec->num_phys_ios++;
2071 rf_unlock_mutex2(rf_tracing_mutex);
2072 }
2073 #endif
2074
2075 /* XXX Ok, let's get aggressive... If b_error is set, let's go
2076 * ballistic, and mark the component as hosed... */
2077
2078 if (bp->b_error != 0) {
2079 /* Mark the disk as dead */
2080 /* but only mark it once... */
2081 /* and only if it wouldn't leave this RAID set
2082 completely broken */
2083 if (((queue->raidPtr->Disks[queue->col].status ==
2084 rf_ds_optimal) ||
2085 (queue->raidPtr->Disks[queue->col].status ==
2086 rf_ds_used_spare)) &&
2087 (queue->raidPtr->numFailures <
2088 queue->raidPtr->Layout.map->faultsTolerated)) {
2089 printf("raid%d: IO Error (%d). Marking %s as failed.\n",
2090 queue->raidPtr->raidid,
2091 bp->b_error,
2092 queue->raidPtr->Disks[queue->col].devname);
2093 queue->raidPtr->Disks[queue->col].status =
2094 rf_ds_failed;
2095 queue->raidPtr->status = rf_rs_degraded;
2096 queue->raidPtr->numFailures++;
2097 queue->raidPtr->numNewFailures++;
2098 } else { /* Disk is already dead... */
2099 /* printf("Disk already marked as dead!\n"); */
2100 }
2101
2102 }
2103
2104 /* Fill in the error value */
2105 req->error = bp->b_error;
2106
2107 /* Drop this one on the "finished" queue... */
2108 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2109
2110 /* Let the raidio thread know there is work to be done. */
2111 rf_signal_cond2(queue->raidPtr->iodone_cv);
2112
2113 rf_unlock_mutex2(queue->raidPtr->iodone_lock);
2114 }
2115
2116
2117 /*
2118 * initialize a buf structure for doing an I/O in the kernel.
2119 */
2120 static void
2121 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2122 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2123 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2124 struct proc *b_proc)
2125 {
2126 /* bp->b_flags = B_PHYS | rw_flag; */
2127 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2128 bp->b_oflags = 0;
2129 bp->b_cflags = 0;
2130 bp->b_bcount = numSect << logBytesPerSector;
2131 bp->b_bufsize = bp->b_bcount;
2132 bp->b_error = 0;
2133 bp->b_dev = dev;
2134 bp->b_data = bf;
2135 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
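	/* Worked example (illustrative): with 512-byte sectors
	 * (logBytesPerSector == 9) and DEV_BSHIFT == 9 this is an
	 * identity mapping; with 4096-byte sectors each RAIDframe
	 * sector expands to 8 DEV_BSIZE blocks. */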
2136 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2137 if (bp->b_bcount == 0) {
2138 panic("bp->b_bcount is zero in InitBP!!");
2139 }
2140 bp->b_proc = b_proc;
2141 bp->b_iodone = cbFunc;
2142 bp->b_private = cbArg;
2143 }
2144
2145 /*
2146 * Wait interruptibly for an exclusive lock.
2147 *
2148 * XXX
2149 * Several drivers do this; it should be abstracted and made MP-safe.
2150 * (Hmm... where have we seen this warning before :-> GO )
2151 */
2152 static int
2153 raidlock(struct raid_softc *rs)
2154 {
2155 int error;
2156
2157 error = 0;
2158 mutex_enter(&rs->sc_mutex);
2159 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2160 rs->sc_flags |= RAIDF_WANTED;
2161 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2162 if (error != 0)
2163 goto done;
2164 }
2165 rs->sc_flags |= RAIDF_LOCKED;
2166 done:
2167 mutex_exit(&rs->sc_mutex);
2168 return (error);
2169 }
2170 /*
2171 * Unlock and wake up any waiters.
2172 */
2173 static void
2174 raidunlock(struct raid_softc *rs)
2175 {
2176
2177 mutex_enter(&rs->sc_mutex);
2178 rs->sc_flags &= ~RAIDF_LOCKED;
2179 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2180 rs->sc_flags &= ~RAIDF_WANTED;
2181 cv_broadcast(&rs->sc_cv);
2182 }
2183 mutex_exit(&rs->sc_mutex);
2184 }
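
/*
 * A minimal usage sketch (raid_detach() below is a real caller):
 *
 *	if ((error = raidlock(rs)) != 0)
 *		return error;
 *	... modify unit state ...
 *	raidunlock(rs);
 */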
2185
2186
2187 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2188 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2189 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
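
/*
 * Sketch of the reserved area at the front of each component, as
 * implied by the helpers below (sizes round up to one sector whenever
 * the sector size exceeds them):
 *
 *	byte 16384:	component label	(>= RF_COMPONENT_INFO_SIZE)
 *	followed by:	parity map	(>= RF_PARITY_MAP_SIZE)
 *
 * Callers should go through rf_component_info_offset()/_size() and
 * rf_parity_map_offset()/_size() rather than hardcoding this layout.
 */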
2190
2191 static daddr_t
2192 rf_component_info_offset(void)
2193 {
2194
2195 return RF_COMPONENT_INFO_OFFSET;
2196 }
2197
2198 static daddr_t
2199 rf_component_info_size(unsigned secsize)
2200 {
2201 daddr_t info_size;
2202
2203 KASSERT(secsize);
2204 if (secsize > RF_COMPONENT_INFO_SIZE)
2205 info_size = secsize;
2206 else
2207 info_size = RF_COMPONENT_INFO_SIZE;
2208
2209 return info_size;
2210 }
2211
2212 static daddr_t
2213 rf_parity_map_offset(RF_Raid_t *raidPtr)
2214 {
2215 daddr_t map_offset;
2216
2217 KASSERT(raidPtr->bytesPerSector);
2218 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2219 map_offset = raidPtr->bytesPerSector;
2220 else
2221 map_offset = RF_COMPONENT_INFO_SIZE;
2222 map_offset += rf_component_info_offset();
2223
2224 return map_offset;
2225 }
2226
2227 static daddr_t
2228 rf_parity_map_size(RF_Raid_t *raidPtr)
2229 {
2230 daddr_t map_size;
2231
2232 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2233 map_size = raidPtr->bytesPerSector;
2234 else
2235 map_size = RF_PARITY_MAP_SIZE;
2236
2237 return map_size;
2238 }
2239
2240 int
2241 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2242 {
2243 RF_ComponentLabel_t *clabel;
2244
2245 clabel = raidget_component_label(raidPtr, col);
2246 clabel->clean = RF_RAID_CLEAN;
2247 raidflush_component_label(raidPtr, col);
2248 return(0);
2249 }
2250
2251
2252 int
2253 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2254 {
2255 RF_ComponentLabel_t *clabel;
2256
2257 clabel = raidget_component_label(raidPtr, col);
2258 clabel->clean = RF_RAID_DIRTY;
2259 raidflush_component_label(raidPtr, col);
2260 return(0);
2261 }
2262
2263 int
2264 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2265 {
2266 KASSERT(raidPtr->bytesPerSector);
2267 return raidread_component_label(raidPtr->bytesPerSector,
2268 raidPtr->Disks[col].dev,
2269 raidPtr->raid_cinfo[col].ci_vp,
2270 &raidPtr->raid_cinfo[col].ci_label);
2271 }
2272
2273 RF_ComponentLabel_t *
2274 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2275 {
2276 return &raidPtr->raid_cinfo[col].ci_label;
2277 }
2278
2279 int
2280 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2281 {
2282 RF_ComponentLabel_t *label;
2283
2284 label = &raidPtr->raid_cinfo[col].ci_label;
2285 label->mod_counter = raidPtr->mod_counter;
2286 #ifndef RF_NO_PARITY_MAP
2287 label->parity_map_modcount = label->mod_counter;
2288 #endif
2289 return raidwrite_component_label(raidPtr->bytesPerSector,
2290 raidPtr->Disks[col].dev,
2291 raidPtr->raid_cinfo[col].ci_vp, label);
2292 }
2293
2294
2295 static int
2296 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2297 RF_ComponentLabel_t *clabel)
2298 {
2299 return raidread_component_area(dev, b_vp, clabel,
2300 sizeof(RF_ComponentLabel_t),
2301 rf_component_info_offset(),
2302 rf_component_info_size(secsize));
2303 }
2304
2305 /* ARGSUSED */
2306 static int
2307 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2308 size_t msize, daddr_t offset, daddr_t dsize)
2309 {
2310 struct buf *bp;
2311 int error;
2312
2313 /* XXX should probably ensure that we don't try to do this if
2314 someone has changed rf_protected_sectors. */
2315
2316 if (b_vp == NULL) {
2317 /* For whatever reason, this component is not valid.
2318 Don't try to read a component label from it. */
2319 return(EINVAL);
2320 }
2321
2322 /* get a block of the appropriate size... */
2323 bp = geteblk((int)dsize);
2324 bp->b_dev = dev;
2325
2326 /* get our ducks in a row for the read */
2327 bp->b_blkno = offset / DEV_BSIZE;
2328 bp->b_bcount = dsize;
2329 bp->b_flags |= B_READ;
2330 bp->b_resid = dsize;
2331
2332 bdev_strategy(bp);
2333 error = biowait(bp);
2334
2335 if (!error) {
2336 memcpy(data, bp->b_data, msize);
2337 }
2338
2339 brelse(bp, 0);
2340 return(error);
2341 }
2342
2343
2344 static int
2345 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2346 RF_ComponentLabel_t *clabel)
2347 {
2348 return raidwrite_component_area(dev, b_vp, clabel,
2349 sizeof(RF_ComponentLabel_t),
2350 rf_component_info_offset(),
2351 rf_component_info_size(secsize), 0);
2352 }
2353
2354 /* ARGSUSED */
2355 static int
2356 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2357 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2358 {
2359 struct buf *bp;
2360 int error;
2361
2362 /* get a block of the appropriate size... */
2363 bp = geteblk((int)dsize);
2364 bp->b_dev = dev;
2365
2366 /* get our ducks in a row for the write */
2367 bp->b_blkno = offset / DEV_BSIZE;
2368 bp->b_bcount = dsize;
2369 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2370 bp->b_resid = dsize;
2371
2372 memset(bp->b_data, 0, dsize);
2373 memcpy(bp->b_data, data, msize);
2374
2375 bdev_strategy(bp);
2376 if (asyncp)
2377 return 0;
2378 error = biowait(bp);
2379 brelse(bp, 0);
2380 if (error) {
2381 #if 1
2382 printf("Failed to write RAID component info!\n");
2383 #endif
2384 }
2385
2386 return(error);
2387 }
2388
2389 void
2390 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2391 {
2392 int c;
2393
2394 for (c = 0; c < raidPtr->numCol; c++) {
2395 /* Skip dead disks. */
2396 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2397 continue;
2398 /* XXXjld: what if an error occurs here? */
2399 raidwrite_component_area(raidPtr->Disks[c].dev,
2400 raidPtr->raid_cinfo[c].ci_vp, map,
2401 RF_PARITYMAP_NBYTE,
2402 rf_parity_map_offset(raidPtr),
2403 rf_parity_map_size(raidPtr), 0);
2404 }
2405 }
2406
2407 void
2408 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2409 {
2410 struct rf_paritymap_ondisk tmp;
	int c, first;

	first = 1;
2414 for (c = 0; c < raidPtr->numCol; c++) {
2415 /* Skip dead disks. */
2416 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2417 continue;
2418 raidread_component_area(raidPtr->Disks[c].dev,
2419 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2420 RF_PARITYMAP_NBYTE,
2421 rf_parity_map_offset(raidPtr),
2422 rf_parity_map_size(raidPtr));
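		/*
		 * Merge rather than trust any single copy:
		 * rf_paritymap_merge() is expected to take the
		 * conservative union of the dirty bits, so a region
		 * counts as dirty if any surviving component says so.
		 */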
2423 if (first) {
2424 memcpy(map, &tmp, sizeof(*map));
2425 first = 0;
2426 } else {
2427 rf_paritymap_merge(map, &tmp);
2428 }
2429 }
2430 }
2431
2432 void
2433 rf_markalldirty(RF_Raid_t *raidPtr)
2434 {
2435 RF_ComponentLabel_t *clabel;
2436 int sparecol;
2437 int c;
2438 int j;
2439 int scol = -1;
2440
2441 raidPtr->mod_counter++;
2442 for (c = 0; c < raidPtr->numCol; c++) {
2443 /* we don't want to touch (at all) a disk that has
2444 failed */
2445 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2446 clabel = raidget_component_label(raidPtr, c);
2447 if (clabel->status == rf_ds_spared) {
2448 /* XXX do something special...
2449 but whatever you do, don't
2450 try to access it!! */
2451 } else {
2452 raidmarkdirty(raidPtr, c);
2453 }
2454 }
2455 }
2456
	for (c = 0; c < raidPtr->numSpare; c++) {
2458 sparecol = raidPtr->numCol + c;
2459 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*
			 * We claim this disk is "optimal" if it's
			 * rf_ds_used_spare, as that means it should be
			 * directly substitutable for the disk it
			 * replaced.  We note that too...
			 */
2468
			for (j = 0; j < raidPtr->numCol; j++) {
2470 if (raidPtr->Disks[j].spareCol == sparecol) {
2471 scol = j;
2472 break;
2473 }
2474 }
2475
2476 clabel = raidget_component_label(raidPtr, sparecol);
2477 /* make sure status is noted */
2478
2479 raid_init_component_label(raidPtr, clabel);
2480
2481 clabel->row = 0;
2482 clabel->column = scol;
2483 /* Note: we *don't* change status from rf_ds_used_spare
2484 to rf_ds_optimal */
2485 /* clabel.status = rf_ds_optimal; */
2486
2487 raidmarkdirty(raidPtr, sparecol);
2488 }
2489 }
2490 }
2491
2492
2493 void
2494 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2495 {
2496 RF_ComponentLabel_t *clabel;
2497 int sparecol;
2498 int c;
2499 int j;
2500 int scol;
2501 struct raid_softc *rs = raidPtr->softc;
2502
2503 scol = -1;
2504
2505 /* XXX should do extra checks to make sure things really are clean,
2506 rather than blindly setting the clean bit... */
2507
2508 raidPtr->mod_counter++;
2509
2510 for (c = 0; c < raidPtr->numCol; c++) {
2511 if (raidPtr->Disks[c].status == rf_ds_optimal) {
2512 clabel = raidget_component_label(raidPtr, c);
2513 /* make sure status is noted */
2514 clabel->status = rf_ds_optimal;
2515
2516 /* note what unit we are configured as */
2517 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2518 clabel->last_unit = raidPtr->raidid;
2519
2520 raidflush_component_label(raidPtr, c);
2521 if (final == RF_FINAL_COMPONENT_UPDATE) {
2522 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2523 raidmarkclean(raidPtr, c);
2524 }
2525 }
2526 }
2527 /* else we don't touch it.. */
2528 }
2529
	for (c = 0; c < raidPtr->numSpare; c++) {
2531 sparecol = raidPtr->numCol + c;
2532 /* Need to ensure that the reconstruct actually completed! */
2533 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*
			 * We claim this disk is "optimal" if it's
			 * rf_ds_used_spare, as that means it should be
			 * directly substitutable for the disk it
			 * replaced.  We note that too...
			 */
2542
			for (j = 0; j < raidPtr->numCol; j++) {
2544 if (raidPtr->Disks[j].spareCol == sparecol) {
2545 scol = j;
2546 break;
2547 }
2548 }
2549
2550 /* XXX shouldn't *really* need this... */
2551 clabel = raidget_component_label(raidPtr, sparecol);
2552 /* make sure status is noted */
2553
2554 raid_init_component_label(raidPtr, clabel);
2555
2556 clabel->column = scol;
2557 clabel->status = rf_ds_optimal;
2558 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2559 clabel->last_unit = raidPtr->raidid;
2560
2561 raidflush_component_label(raidPtr, sparecol);
2562 if (final == RF_FINAL_COMPONENT_UPDATE) {
2563 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2564 raidmarkclean(raidPtr, sparecol);
2565 }
2566 }
2567 }
2568 }
2569 }
2570
2571 void
2572 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2573 {
2574
2575 if (vp != NULL) {
2576 if (auto_configured == 1) {
2577 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2578 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2579 vput(vp);
2580
2581 } else {
2582 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2583 }
2584 }
2585 }
2586
2587
2588 void
2589 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2590 {
2591 int r,c;
2592 struct vnode *vp;
2593 int acd;
2594
2595
2596 /* We take this opportunity to close the vnodes like we should.. */
2597
2598 for (c = 0; c < raidPtr->numCol; c++) {
2599 vp = raidPtr->raid_cinfo[c].ci_vp;
2600 acd = raidPtr->Disks[c].auto_configured;
2601 rf_close_component(raidPtr, vp, acd);
2602 raidPtr->raid_cinfo[c].ci_vp = NULL;
2603 raidPtr->Disks[c].auto_configured = 0;
2604 }
2605
2606 for (r = 0; r < raidPtr->numSpare; r++) {
2607 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2608 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2609 rf_close_component(raidPtr, vp, acd);
2610 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2611 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2612 }
2613 }
2614
2615
2616 void
2617 rf_ReconThread(struct rf_recon_req_internal *req)
2618 {
2619 int s;
2620 RF_Raid_t *raidPtr;
2621
2622 s = splbio();
2623 raidPtr = (RF_Raid_t *) req->raidPtr;
2624 raidPtr->recon_in_progress = 1;
2625
2626 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2627 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2628
2629 RF_Free(req, sizeof(*req));
2630
2631 raidPtr->recon_in_progress = 0;
2632 splx(s);
2633
2634 /* That's all... */
2635 kthread_exit(0); /* does not return */
2636 }
2637
2638 void
2639 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2640 {
2641 int retcode;
2642 int s;
2643
2644 raidPtr->parity_rewrite_stripes_done = 0;
2645 raidPtr->parity_rewrite_in_progress = 1;
2646 s = splbio();
2647 retcode = rf_RewriteParity(raidPtr);
2648 splx(s);
2649 if (retcode) {
2650 printf("raid%d: Error re-writing parity (%d)!\n",
2651 raidPtr->raidid, retcode);
2652 } else {
2653 /* set the clean bit! If we shutdown correctly,
2654 the clean bit on each component label will get
2655 set */
2656 raidPtr->parity_good = RF_RAID_CLEAN;
2657 }
2658 raidPtr->parity_rewrite_in_progress = 0;
2659
2660 /* Anyone waiting for us to stop? If so, inform them... */
2661 if (raidPtr->waitShutdown) {
2662 rf_lock_mutex2(raidPtr->rad_lock);
2663 cv_broadcast(&raidPtr->parity_rewrite_cv);
2664 rf_unlock_mutex2(raidPtr->rad_lock);
2665 }
2666
2667 /* That's all... */
2668 kthread_exit(0); /* does not return */
2669 }
2670
2671
2672 void
2673 rf_CopybackThread(RF_Raid_t *raidPtr)
2674 {
2675 int s;
2676
2677 raidPtr->copyback_in_progress = 1;
2678 s = splbio();
2679 rf_CopybackReconstructedData(raidPtr);
2680 splx(s);
2681 raidPtr->copyback_in_progress = 0;
2682
2683 /* That's all... */
2684 kthread_exit(0); /* does not return */
2685 }
2686
2687
2688 void
2689 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
2690 {
2691 int s;
2692 RF_Raid_t *raidPtr;
2693
2694 s = splbio();
2695 raidPtr = req->raidPtr;
2696 raidPtr->recon_in_progress = 1;
2697 rf_ReconstructInPlace(raidPtr, req->col);
2698 RF_Free(req, sizeof(*req));
2699 raidPtr->recon_in_progress = 0;
2700 splx(s);
2701
2702 /* That's all... */
2703 kthread_exit(0); /* does not return */
2704 }
2705
2706 static RF_AutoConfig_t *
2707 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2708 const char *cname, RF_SectorCount_t size, uint64_t numsecs,
2709 unsigned secsize)
2710 {
2711 int good_one = 0;
2712 RF_ComponentLabel_t *clabel;
2713 RF_AutoConfig_t *ac;
2714
2715 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);
2716
2717 if (!raidread_component_label(secsize, dev, vp, clabel)) {
2718 /* Got the label. Does it look reasonable? */
2719 if (rf_reasonable_label(clabel, numsecs) &&
2720 (rf_component_label_partitionsize(clabel) <= size)) {
2721 #ifdef DEBUG
2722 printf("Component on: %s: %llu\n",
2723 cname, (unsigned long long)size);
2724 rf_print_component_label(clabel);
2725 #endif
2726 /* if it's reasonable, add it, else ignore it. */
2727 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2728 M_WAITOK);
2729 strlcpy(ac->devname, cname, sizeof(ac->devname));
2730 ac->dev = dev;
2731 ac->vp = vp;
2732 ac->clabel = clabel;
2733 ac->next = ac_list;
2734 ac_list = ac;
2735 good_one = 1;
2736 }
2737 }
2738 if (!good_one) {
2739 /* cleanup */
2740 free(clabel, M_RAIDFRAME);
2741 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2742 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2743 vput(vp);
2744 }
2745 return ac_list;
2746 }
2747
2748 RF_AutoConfig_t *
2749 rf_find_raid_components(void)
2750 {
2751 struct vnode *vp;
2752 struct disklabel label;
2753 device_t dv;
2754 deviter_t di;
2755 dev_t dev;
2756 int bmajor, bminor, wedge, rf_part_found;
2757 int error;
2758 int i;
2759 RF_AutoConfig_t *ac_list;
2760 uint64_t numsecs;
2761 unsigned secsize;
2762 int dowedges;
2763
2764 /* initialize the AutoConfig list */
2765 ac_list = NULL;
2766
	/*
	 * We begin by trolling through *all* the devices on the system,
	 * *twice*: first we scan for wedges, second for other devices.
	 * This avoids using a raw partition instead of a wedge that
	 * covers the whole disk.
	 */
2772
	for (dowedges = 1; dowedges >= 0; --dowedges) {
2774 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
2775 dv = deviter_next(&di)) {
2776
2777 /* we are only interested in disks... */
2778 if (device_class(dv) != DV_DISK)
2779 continue;
2780
2781 /* we don't care about floppies... */
2782 if (device_is_a(dv, "fd")) {
2783 continue;
2784 }
2785
2786 /* we don't care about CD's... */
2787 if (device_is_a(dv, "cd")) {
2788 continue;
2789 }
2790
2791 /* we don't care about md's... */
2792 if (device_is_a(dv, "md")) {
2793 continue;
2794 }
2795
2796 /* hdfd is the Atari/Hades floppy driver */
2797 if (device_is_a(dv, "hdfd")) {
2798 continue;
2799 }
2800
2801 /* fdisa is the Atari/Milan floppy driver */
2802 if (device_is_a(dv, "fdisa")) {
2803 continue;
2804 }
2805
2806 /* are we in the wedges pass ? */
2807 wedge = device_is_a(dv, "dk");
2808 if (wedge != dowedges) {
2809 continue;
2810 }
2811
2812 /* need to find the device_name_to_block_device_major stuff */
2813 bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
2814
			rf_part_found = 0; /* No raid partition as yet */
2816
2817 /* get a vnode for the raw partition of this disk */
2818 bminor = minor(device_unit(dv));
2819 dev = wedge ? makedev(bmajor, bminor) :
2820 MAKEDISKDEV(bmajor, bminor, RAW_PART);
2821 if (bdevvp(dev, &vp))
2822 panic("RAID can't alloc vnode");
2823
2824 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2825 error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
2826
2827 if (error) {
2828 /* "Who cares." Continue looking
2829 for something that exists*/
2830 vput(vp);
2831 continue;
2832 }
2833
2834 error = getdisksize(vp, &numsecs, &secsize);
2835 if (error) {
2836 /*
2837 * Pseudo devices like vnd and cgd can be
2838 * opened but may still need some configuration.
2839 * Ignore these quietly.
2840 */
2841 if (error != ENXIO)
2842 printf("RAIDframe: can't get disk size"
2843 " for dev %s (%d)\n",
2844 device_xname(dv), error);
2845 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2846 vput(vp);
2847 continue;
2848 }
2849 if (wedge) {
2850 struct dkwedge_info dkw;
2851 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
2852 NOCRED);
2853 if (error) {
2854 printf("RAIDframe: can't get wedge info for "
2855 "dev %s (%d)\n", device_xname(dv), error);
2856 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2857 vput(vp);
2858 continue;
2859 }
2860
2861 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
2862 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2863 vput(vp);
2864 continue;
2865 }
2866
2867 VOP_UNLOCK(vp);
2868 ac_list = rf_get_component(ac_list, dev, vp,
2869 device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /* There is a raid component on this disk */
2871 continue;
2872 }
2873
2874 /* Ok, the disk exists. Go get the disklabel. */
2875 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
2876 if (error) {
2877 /*
2878 * XXX can't happen - open() would
2879 * have errored out (or faked up one)
2880 */
2881 if (error != ENOTTY)
2882 printf("RAIDframe: can't get label for dev "
2883 "%s (%d)\n", device_xname(dv), error);
2884 }
2885
2886 /* don't need this any more. We'll allocate it again
2887 a little later if we really do... */
2888 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2889 vput(vp);
2890
2891 if (error)
2892 continue;
2893
			rf_part_found = 0; /* No raid partitions yet */
2895 for (i = 0; i < label.d_npartitions; i++) {
2896 char cname[sizeof(ac_list->devname)];
2897
2898 /* We only support partitions marked as RAID */
2899 if (label.d_partitions[i].p_fstype != FS_RAID)
2900 continue;
2901
2902 dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
2903 if (bdevvp(dev, &vp))
2904 panic("RAID can't alloc vnode");
2905
2906 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2907 error = VOP_OPEN(vp, FREAD, NOCRED);
2908 if (error) {
2909 /* Whatever... */
2910 vput(vp);
2911 continue;
2912 }
2913 VOP_UNLOCK(vp);
2914 snprintf(cname, sizeof(cname), "%s%c",
2915 device_xname(dv), 'a' + i);
2916 ac_list = rf_get_component(ac_list, dev, vp, cname,
2917 label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /* There is at least one raid partition on this disk */
2919 }
2920
			/*
			 * If there is no raid component on this disk,
			 * either in a disklabel or inside a wedge,
			 * check the raw partition as well, as it is
			 * possible to configure raid components on
			 * raw disk devices.
			 */
2927
2928 if (!rf_part_found) {
2929 char cname[sizeof(ac_list->devname)];
2930
2931 dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
2932 if (bdevvp(dev, &vp))
2933 panic("RAID can't alloc vnode");
2934
2935 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2936
2937 error = VOP_OPEN(vp, FREAD, NOCRED);
2938 if (error) {
2939 /* Whatever... */
2940 vput(vp);
2941 continue;
2942 }
2943 VOP_UNLOCK(vp);
2944 snprintf(cname, sizeof(cname), "%s%c",
2945 device_xname(dv), 'a' + RAW_PART);
2946 ac_list = rf_get_component(ac_list, dev, vp, cname,
2947 label.d_partitions[RAW_PART].p_size, numsecs, secsize);
2948 }
2949 }
2950 deviter_release(&di);
2951 }
2952 return ac_list;
2953 }
2954
2955
2956 int
2957 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
2958 {
2959
	if (((clabel->version == RF_COMPONENT_LABEL_VERSION_1) ||
	    (clabel->version == RF_COMPONENT_LABEL_VERSION)) &&
	    ((clabel->clean == RF_RAID_CLEAN) ||
	    (clabel->clean == RF_RAID_DIRTY)) &&
	    clabel->row >= 0 &&
2965 clabel->column >= 0 &&
2966 clabel->num_rows > 0 &&
2967 clabel->num_columns > 0 &&
2968 clabel->row < clabel->num_rows &&
2969 clabel->column < clabel->num_columns &&
2970 clabel->blockSize > 0 &&
2971 /*
2972 * numBlocksHi may contain garbage, but it is ok since
2973 * the type is unsigned. If it is really garbage,
2974 * rf_fix_old_label_size() will fix it.
2975 */
2976 rf_component_label_numblocks(clabel) > 0) {
2977 /*
2978 * label looks reasonable enough...
2979 * let's make sure it has no old garbage.
2980 */
2981 if (numsecs)
2982 rf_fix_old_label_size(clabel, numsecs);
2983 return(1);
2984 }
2985 return(0);
2986 }
2987
2988
2989 /*
2990 * For reasons yet unknown, some old component labels have garbage in
2991 * the newer numBlocksHi region, and this causes lossage. Since those
2992 * disks will also have numsecs set to less than 32 bits of sectors,
2993 * we can determine when this corruption has occurred, and fix it.
2994 *
2995 * The exact same problem, with the same unknown reason, happens to
2996 * the partitionSizeHi member as well.
2997 */
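/*
 * Illustrative example: a component with numsecs = 0x10000 cannot need
 * the Hi fields at all, so a label carrying, say, numBlocksHi = 0xdead
 * must be stale garbage; zeroing it loses nothing, since the low
 * 32 bits alone already describe the whole disk.
 */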
2998 static void
2999 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3000 {
3001
3002 if (numsecs < ((uint64_t)1 << 32)) {
3003 if (clabel->numBlocksHi) {
3004 printf("WARNING: total sectors < 32 bits, yet "
3005 "numBlocksHi set\n"
3006 "WARNING: resetting numBlocksHi to zero.\n");
3007 clabel->numBlocksHi = 0;
3008 }
3009
3010 if (clabel->partitionSizeHi) {
3011 printf("WARNING: total sectors < 32 bits, yet "
3012 "partitionSizeHi set\n"
3013 "WARNING: resetting partitionSizeHi to zero.\n");
3014 clabel->partitionSizeHi = 0;
3015 }
3016 }
3017 }
3018
3019
3020 #ifdef DEBUG
3021 void
3022 rf_print_component_label(RF_ComponentLabel_t *clabel)
3023 {
3024 uint64_t numBlocks;
3025 static const char *rp[] = {
3026 "No", "Force", "Soft", "*invalid*"
3027 };
3028
3029
3030 numBlocks = rf_component_label_numblocks(clabel);
3031
3032 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
3033 clabel->row, clabel->column,
3034 clabel->num_rows, clabel->num_columns);
3035 printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
3036 clabel->version, clabel->serial_number,
3037 clabel->mod_counter);
3038 printf(" Clean: %s Status: %d\n",
3039 clabel->clean ? "Yes" : "No", clabel->status);
3040 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
3041 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
3042 printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n",
3043 (char) clabel->parityConfig, clabel->blockSize, numBlocks);
3044 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
3045 printf(" Root partition: %s\n", rp[clabel->root_partition & 3]);
3046 printf(" Last configured as: raid%d\n", clabel->last_unit);
3047 #if 0
3048 printf(" Config order: %d\n", clabel->config_order);
3049 #endif
3050
3051 }
3052 #endif
3053
3054 RF_ConfigSet_t *
3055 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3056 {
3057 RF_AutoConfig_t *ac;
3058 RF_ConfigSet_t *config_sets;
3059 RF_ConfigSet_t *cset;
3060 RF_AutoConfig_t *ac_next;
3061
3062
3063 config_sets = NULL;
3064
3065 /* Go through the AutoConfig list, and figure out which components
3066 belong to what sets. */
3067 ac = ac_list;
	while (ac != NULL) {
3069 /* we're going to putz with ac->next, so save it here
3070 for use at the end of the loop */
3071 ac_next = ac->next;
3072
3073 if (config_sets == NULL) {
3074 /* will need at least this one... */
3075 config_sets = malloc(sizeof(RF_ConfigSet_t),
3076 M_RAIDFRAME, M_WAITOK);
3077 /* this one is easy :) */
3078 config_sets->ac = ac;
3079 config_sets->next = NULL;
3080 config_sets->rootable = 0;
3081 ac->next = NULL;
3082 } else {
3083 /* which set does this component fit into? */
3084 cset = config_sets;
			while (cset != NULL) {
3086 if (rf_does_it_fit(cset, ac)) {
3087 /* looks like it matches... */
3088 ac->next = cset->ac;
3089 cset->ac = ac;
3090 break;
3091 }
3092 cset = cset->next;
3093 }
			if (cset == NULL) {
3095 /* didn't find a match above... new set..*/
3096 cset = malloc(sizeof(RF_ConfigSet_t),
3097 M_RAIDFRAME, M_WAITOK);
3098 cset->ac = ac;
3099 ac->next = NULL;
3100 cset->next = config_sets;
3101 cset->rootable = 0;
3102 config_sets = cset;
3103 }
3104 }
3105 ac = ac_next;
3106 }
3107
3108
3109 return(config_sets);
3110 }
3111
3112 static int
3113 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3114 {
3115 RF_ComponentLabel_t *clabel1, *clabel2;
3116
3117 /* If this one matches the *first* one in the set, that's good
3118 enough, since the other members of the set would have been
3119 through here too... */
3120 /* note that we are not checking partitionSize here..
3121
3122 Note that we are also not checking the mod_counters here.
3123 If everything else matches except the mod_counter, that's
3124 good enough for this test. We will deal with the mod_counters
3125 a little later in the autoconfiguration process.
3126
3127 (clabel1->mod_counter == clabel2->mod_counter) &&
3128
3129 The reason we don't check for this is that failed disks
3130 will have lower modification counts. If those disks are
3131 not added to the set they used to belong to, then they will
3132 form their own set, which may result in 2 different sets,
3133 for example, competing to be configured at raid0, and
3134 perhaps competing to be the root filesystem set. If the
3135 wrong ones get configured, or both attempt to become /,
	   weird behaviour and/or serious lossage will occur. Thus we
3137 need to bring them into the fold here, and kick them out at
3138 a later point.
3139
3140 */
3141
3142 clabel1 = cset->ac->clabel;
3143 clabel2 = ac->clabel;
3144 if ((clabel1->version == clabel2->version) &&
3145 (clabel1->serial_number == clabel2->serial_number) &&
3146 (clabel1->num_rows == clabel2->num_rows) &&
3147 (clabel1->num_columns == clabel2->num_columns) &&
3148 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3149 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3150 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3151 (clabel1->parityConfig == clabel2->parityConfig) &&
3152 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3153 (clabel1->blockSize == clabel2->blockSize) &&
3154 rf_component_label_numblocks(clabel1) ==
3155 rf_component_label_numblocks(clabel2) &&
3156 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3157 (clabel1->root_partition == clabel2->root_partition) &&
3158 (clabel1->last_unit == clabel2->last_unit) &&
3159 (clabel1->config_order == clabel2->config_order)) {
		/* if it gets here, it almost *has* to be a match */
3161 } else {
3162 /* it's not consistent with somebody in the set..
3163 punt */
3164 return(0);
3165 }
3166 /* all was fine.. it must fit... */
3167 return(1);
3168 }
3169
3170 int
3171 rf_have_enough_components(RF_ConfigSet_t *cset)
3172 {
3173 RF_AutoConfig_t *ac;
3174 RF_AutoConfig_t *auto_config;
3175 RF_ComponentLabel_t *clabel;
3176 int c;
3177 int num_cols;
3178 int num_missing;
3179 int mod_counter;
3180 int mod_counter_found;
3181 int even_pair_failed;
3182 char parity_type;
3183
3184
3185 /* check to see that we have enough 'live' components
3186 of this set. If so, we can configure it if necessary */
3187
3188 num_cols = cset->ac->clabel->num_columns;
3189 parity_type = cset->ac->clabel->parityConfig;
3190
3191 /* XXX Check for duplicate components!?!?!? */
3192
3193 /* Determine what the mod_counter is supposed to be for this set. */
3194
3195 mod_counter_found = 0;
3196 mod_counter = 0;
3197 ac = cset->ac;
	while (ac != NULL) {
		if (mod_counter_found == 0) {
3200 mod_counter = ac->clabel->mod_counter;
3201 mod_counter_found = 1;
3202 } else {
3203 if (ac->clabel->mod_counter > mod_counter) {
3204 mod_counter = ac->clabel->mod_counter;
3205 }
3206 }
3207 ac = ac->next;
3208 }
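
	/* Components carrying a mod_counter below the maximum found
	   above are stale (e.g. a disk that dropped out of the set
	   earlier); the matching loop below requires an exact
	   mod_counter match, so stale components cannot shadow live
	   ones. */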
3209
3210 num_missing = 0;
3211 auto_config = cset->ac;
3212
3213 even_pair_failed = 0;
	for (c = 0; c < num_cols; c++) {
		ac = auto_config;
		while (ac != NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				    ac->devname, c);
#endif
				break;
			}
			ac = ac->next;
		}
		if (ac == NULL) {
3229 /* Didn't find one here! */
3230 /* special case for RAID 1, especially
3231 where there are more than 2
3232 components (where RAIDframe treats
3233 things a little differently :( ) */
3234 if (parity_type == '1') {
				if (c % 2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If this one
					     has failed, and so has its
					     even partner, it's
					     "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
3246 } else {
3247 /* normal accounting */
3248 num_missing++;
3249 }
3250 }
		if ((parity_type == '1') && (c % 2 == 1)) {
			/* Just finished the odd half of a pair, and we
			   didn't bail.. reset the even_pair_failed flag,
			   and go on to the next component.... */
3255 even_pair_failed = 0;
3256 }
3257 }
3258
3259 clabel = cset->ac->clabel;
3260
3261 if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3262 ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3263 ((clabel->parityConfig == '5') && (num_missing > 1))) {
3264 /* XXX this needs to be made *much* more general */
3265 /* Too many failures */
3266 return(0);
3267 }
3268 /* otherwise, all is well, and we've got enough to take a kick
3269 at autoconfiguring this set */
3270 return(1);
3271 }
3272
3273 void
3274 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3275 RF_Raid_t *raidPtr)
3276 {
3277 RF_ComponentLabel_t *clabel;
3278 int i;
3279
3280 clabel = ac->clabel;
3281
3282 /* 1. Fill in the common stuff */
3283 config->numCol = clabel->num_columns;
3284 config->numSpare = 0; /* XXX should this be set here? */
3285 config->sectPerSU = clabel->sectPerSU;
3286 config->SUsPerPU = clabel->SUsPerPU;
3287 config->SUsPerRU = clabel->SUsPerRU;
3288 config->parityConfig = clabel->parityConfig;
3289 /* XXX... */
3290 strcpy(config->diskQueueType,"fifo");
3291 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3292 config->layoutSpecificSize = 0; /* XXX ?? */
3293
	while (ac != NULL) {
3295 /* row/col values will be in range due to the checks
3296 in reasonable_label() */
3297 strcpy(config->devnames[0][ac->clabel->column],
3298 ac->devname);
3299 ac = ac->next;
3300 }
3301
	for (i = 0; i < RF_MAXDBGV; i++) {
3303 config->debugVars[i][0] = 0;
3304 }
3305 }
3306
3307 int
3308 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3309 {
3310 RF_ComponentLabel_t *clabel;
3311 int column;
3312 int sparecol;
3313
3314 raidPtr->autoconfigure = new_value;
3315
	for (column = 0; column < raidPtr->numCol; column++) {
3317 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3318 clabel = raidget_component_label(raidPtr, column);
3319 clabel->autoconfigure = new_value;
3320 raidflush_component_label(raidPtr, column);
3321 }
3322 }
	for (column = 0; column < raidPtr->numSpare; column++) {
3324 sparecol = raidPtr->numCol + column;
3325 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3326 clabel = raidget_component_label(raidPtr, sparecol);
3327 clabel->autoconfigure = new_value;
3328 raidflush_component_label(raidPtr, sparecol);
3329 }
3330 }
3331 return(new_value);
3332 }
3333
3334 int
3335 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3336 {
3337 RF_ComponentLabel_t *clabel;
3338 int column;
3339 int sparecol;
3340
3341 raidPtr->root_partition = new_value;
	for (column = 0; column < raidPtr->numCol; column++) {
3343 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3344 clabel = raidget_component_label(raidPtr, column);
3345 clabel->root_partition = new_value;
3346 raidflush_component_label(raidPtr, column);
3347 }
3348 }
	for (column = 0; column < raidPtr->numSpare; column++) {
3350 sparecol = raidPtr->numCol + column;
3351 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3352 clabel = raidget_component_label(raidPtr, sparecol);
3353 clabel->root_partition = new_value;
3354 raidflush_component_label(raidPtr, sparecol);
3355 }
3356 }
3357 return(new_value);
3358 }
3359
3360 void
3361 rf_release_all_vps(RF_ConfigSet_t *cset)
3362 {
3363 RF_AutoConfig_t *ac;
3364
3365 ac = cset->ac;
	while (ac != NULL) {
3367 /* Close the vp, and give it back */
3368 if (ac->vp) {
3369 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3370 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3371 vput(ac->vp);
3372 ac->vp = NULL;
3373 }
3374 ac = ac->next;
3375 }
3376 }
3377
3378
3379 void
3380 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3381 {
3382 RF_AutoConfig_t *ac;
3383 RF_AutoConfig_t *next_ac;
3384
3385 ac = cset->ac;
	while (ac != NULL) {
3387 next_ac = ac->next;
3388 /* nuke the label */
3389 free(ac->clabel, M_RAIDFRAME);
3390 /* cleanup the config structure */
3391 free(ac, M_RAIDFRAME);
3392 /* "next.." */
3393 ac = next_ac;
3394 }
3395 /* and, finally, nuke the config set */
3396 free(cset, M_RAIDFRAME);
3397 }
3398
3399
3400 void
3401 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3402 {
3403 /* current version number */
3404 clabel->version = RF_COMPONENT_LABEL_VERSION;
3405 clabel->serial_number = raidPtr->serial_number;
3406 clabel->mod_counter = raidPtr->mod_counter;
3407
3408 clabel->num_rows = 1;
3409 clabel->num_columns = raidPtr->numCol;
3410 clabel->clean = RF_RAID_DIRTY; /* not clean */
3411 clabel->status = rf_ds_optimal; /* "It's good!" */
3412
3413 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3414 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3415 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3416
3417 clabel->blockSize = raidPtr->bytesPerSector;
3418 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3419
3420 /* XXX not portable */
3421 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3422 clabel->maxOutstanding = raidPtr->maxOutstanding;
3423 clabel->autoconfigure = raidPtr->autoconfigure;
3424 clabel->root_partition = raidPtr->root_partition;
3425 clabel->last_unit = raidPtr->raidid;
3426 clabel->config_order = raidPtr->config_order;
3427
3428 #ifndef RF_NO_PARITY_MAP
3429 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3430 #endif
3431 }
3432
3433 struct raid_softc *
3434 rf_auto_config_set(RF_ConfigSet_t *cset)
3435 {
3436 RF_Raid_t *raidPtr;
3437 RF_Config_t *config;
3438 int raidID;
3439 struct raid_softc *sc;
3440
3441 #ifdef DEBUG
3442 printf("RAID autoconfigure\n");
3443 #endif
3444
3445 /* 1. Create a config structure */
3446 config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);
3447
3448 /*
3449 2. Figure out what RAID ID this one is supposed to live at
3450 See if we can get the same RAID dev that it was configured
3451 on last time..
3452 */
3453
3454 raidID = cset->ac->clabel->last_unit;
3455 for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
3456 sc = raidget(++raidID, false))
3457 continue;
3458 #ifdef DEBUG
3459 printf("Configuring raid%d:\n",raidID);
3460 #endif
3461
3462 if (sc == NULL)
3463 sc = raidget(raidID, true);
3464 raidPtr = &sc->sc_r;
3465
3466 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3467 raidPtr->softc = sc;
3468 raidPtr->raidid = raidID;
3469 raidPtr->openings = RAIDOUTSTANDING;
3470
3471 /* 3. Build the configuration structure */
3472 rf_create_configuration(cset->ac, config, raidPtr);
3473
3474 /* 4. Do the configuration */
3475 if (rf_Configure(raidPtr, config, cset->ac) == 0) {
3476 raidinit(sc);
3477
3478 rf_markalldirty(raidPtr);
3479 raidPtr->autoconfigure = 1; /* XXX do this here? */
3480 switch (cset->ac->clabel->root_partition) {
3481 case 1: /* Force Root */
3482 case 2: /* Soft Root: root when boot partition part of raid */
3483 /*
3484 * everything configured just fine. Make a note
3485 * that this set is eligible to be root,
3486 * or forced to be root
3487 */
3488 cset->rootable = cset->ac->clabel->root_partition;
3489 /* XXX do this here? */
3490 raidPtr->root_partition = cset->rootable;
3491 break;
3492 default:
3493 break;
3494 }
3495 } else {
3496 raidput(sc);
3497 sc = NULL;
3498 }
3499
3500 /* 5. Cleanup */
3501 free(config, M_RAIDFRAME);
3502 return sc;
3503 }
3504
3505 void
3506 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
3507 size_t xmin, size_t xmax)
3508 {
3509
3510 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3511 pool_sethiwat(p, xmax);
3512 pool_prime(p, xmin);
3513 }
3514
3515 /*
 * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buffer queue
3517 * to see if there is IO pending and if that IO could possibly be done
3518 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3519 * otherwise.
3520 *
3521 */
3522 int
3523 rf_buf_queue_check(RF_Raid_t *raidPtr)
3524 {
3525 struct raid_softc *rs;
3526 struct dk_softc *dksc;
3527
3528 rs = raidPtr->softc;
3529 dksc = &rs->sc_dksc;
3530
3531 if ((rs->sc_flags & RAIDF_INITED) == 0)
3532 return 1;
3533
3534 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3535 /* there is work to do */
3536 return 0;
3537 }
3538 /* default is nothing to do */
3539 return 1;
3540 }
3541
3542 int
3543 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3544 {
3545 uint64_t numsecs;
3546 unsigned secsize;
3547 int error;
3548
3549 error = getdisksize(vp, &numsecs, &secsize);
3550 if (error == 0) {
3551 diskPtr->blockSize = secsize;
3552 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3553 diskPtr->partitionSize = numsecs;
3554 return 0;
3555 }
3556 return error;
3557 }
3558
3559 static int
3560 raid_match(device_t self, cfdata_t cfdata, void *aux)
3561 {
3562 return 1;
3563 }
3564
3565 static void
3566 raid_attach(device_t parent, device_t self, void *aux)
3567 {
3568 }
3569
3570
3571 static int
3572 raid_detach(device_t self, int flags)
3573 {
3574 int error;
3575 struct raid_softc *rs = raidsoftc(self);
3576
3577 if (rs == NULL)
3578 return ENXIO;
3579
3580 if ((error = raidlock(rs)) != 0)
3581 return (error);
3582
3583 error = raid_detach_unlocked(rs);
3584
3585 raidunlock(rs);
3586
3587 /* XXX raid can be referenced here */
3588
3589 if (error)
3590 return error;
3591
3592 /* Free the softc */
3593 raidput(rs);
3594
3595 return 0;
3596 }
3597
3598 static void
3599 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3600 {
3601 struct dk_softc *dksc = &rs->sc_dksc;
3602 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3603
3604 memset(dg, 0, sizeof(*dg));
3605
3606 dg->dg_secperunit = raidPtr->totalSectors;
3607 dg->dg_secsize = raidPtr->bytesPerSector;
3608 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3609 dg->dg_ntracks = 4 * raidPtr->numCol;
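
	/*
	 * The geometry is synthetic: a RAID set has no physical CHS,
	 * so we advertise one data stripe per "track" and a nominal
	 * four tracks per column; dg_secperunit and dg_secsize carry
	 * the real capacity information.
	 */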
3610
3611 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3612 }
3613
3614 /*
3615 * Get cache info for all the components (including spares).
3616 * Returns intersection of all the cache flags of all disks, or first
3617 * error if any encountered.
3618 * XXXfua feature flags can change as spares are added - lock down somehow
3619 */
3620 static int
3621 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3622 {
3623 int c;
3624 int error;
3625 int dkwhole = 0, dkpart;
3626
3627 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3628 /*
3629 * Check any non-dead disk, even when currently being
3630 * reconstructed.
3631 */
3632 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
3633 || raidPtr->Disks[c].status == rf_ds_reconstructing) {
3634 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3635 DIOCGCACHE, &dkpart, FREAD, NOCRED);
3636 if (error) {
3637 if (error != ENODEV) {
3638 printf("raid%d: get cache for component %s failed\n",
3639 raidPtr->raidid,
3640 raidPtr->Disks[c].devname);
3641 }
3642
3643 return error;
3644 }
3645
3646 if (c == 0)
3647 dkwhole = dkpart;
3648 else
3649 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3650 }
3651 }
3652
3653 *data = dkwhole;
3654
3655 return 0;
3656 }
3657
3658 /*
3659 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3660 * We end up returning whatever error was returned by the first cache flush
3661 * that fails.
3662 */
3663
3664 int
3665 rf_sync_component_caches(RF_Raid_t *raidPtr)
3666 {
3667 int c, sparecol;
	int e, error;
3669 int force = 1;
3670
3671 error = 0;
3672 for (c = 0; c < raidPtr->numCol; c++) {
3673 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3674 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3675 &force, FWRITE, NOCRED);
3676 if (e) {
3677 if (e != ENODEV)
3678 printf("raid%d: cache flush to component %s failed.\n",
3679 raidPtr->raidid, raidPtr->Disks[c].devname);
3680 if (error == 0) {
3681 error = e;
3682 }
3683 }
3684 }
3685 }
3686
	for (c = 0; c < raidPtr->numSpare; c++) {
3688 sparecol = raidPtr->numCol + c;
3689 /* Need to ensure that the reconstruct actually completed! */
3690 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3691 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3692 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3693 if (e) {
3694 if (e != ENODEV)
3695 printf("raid%d: cache flush to component %s failed.\n",
3696 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3697 if (error == 0) {
3698 error = e;
3699 }
3700 }
3701 }
3702 }
3703 return error;
3704 }
3705
3706 /* Fill in info with the current status */
3707 void
3708 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3709 {
3710
3711 if (raidPtr->status != rf_rs_reconstructing) {
3712 info->total = 100;
3713 info->completed = 100;
3714 } else {
3715 info->total = raidPtr->reconControl->numRUsTotal;
3716 info->completed = raidPtr->reconControl->numRUsComplete;
3717 }
3718 info->remaining = info->total - info->completed;
3719 }
3720
3721 /* Fill in info with the current status */
3722 void
3723 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3724 {
3725
3726 if (raidPtr->parity_rewrite_in_progress == 1) {
3727 info->total = raidPtr->Layout.numStripe;
3728 info->completed = raidPtr->parity_rewrite_stripes_done;
3729 } else {
3730 info->completed = 100;
3731 info->total = 100;
3732 }
3733 info->remaining = info->total - info->completed;
3734 }
3735
3736 /* Fill in info with the current status */
3737 void
3738 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3739 {
3740
3741 if (raidPtr->copyback_in_progress == 1) {
3742 info->total = raidPtr->Layout.numStripe;
3743 info->completed = raidPtr->copyback_stripes_done;
3744 info->remaining = info->total - info->completed;
3745 } else {
3746 info->remaining = 0;
3747 info->completed = 100;
3748 info->total = 100;
3749 }
3750 }
3751
3752 /* Fill in config with the current info */
3753 int
3754 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3755 {
3756 int d, i, j;
3757
3758 if (!raidPtr->valid)
3759 return (ENODEV);
3760 config->cols = raidPtr->numCol;
3761 config->ndevs = raidPtr->numCol;
3762 if (config->ndevs >= RF_MAX_DISKS)
3763 return (ENOMEM);
3764 config->nspares = raidPtr->numSpare;
3765 if (config->nspares >= RF_MAX_DISKS)
3766 return (ENOMEM);
3767 config->maxqdepth = raidPtr->maxQueueDepth;
3768 d = 0;
3769 for (j = 0; j < config->cols; j++) {
3770 config->devs[d] = raidPtr->Disks[j];
3771 d++;
3772 }
3773 for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
3774 config->spares[i] = raidPtr->Disks[j];
3775 if (config->spares[i].status == rf_ds_rebuilding_spare) {
3776 /* XXX: raidctl(8) expects to see this as a used spare */
3777 config->spares[i].status = rf_ds_used_spare;
3778 }
3779 }
3780 return 0;
3781 }
3782
3783 int
3784 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
3785 {
3786 RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
3787 RF_ComponentLabel_t *raid_clabel;
3788 int column = clabel->column;
3789
3790 if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
3791 return EINVAL;
3792 raid_clabel = raidget_component_label(raidPtr, column);
3793 memcpy(clabel, raid_clabel, sizeof *clabel);
3794
3795 return 0;
3796 }
3797
3798 /*
3799 * Module interface
3800 */
3801
3802 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
3803
3804 #ifdef _MODULE
3805 CFDRIVER_DECL(raid, DV_DISK, NULL);
3806 #endif
3807
3808 static int raid_modcmd(modcmd_t, void *);
3809 static int raid_modcmd_init(void);
3810 static int raid_modcmd_fini(void);
3811
3812 static int
3813 raid_modcmd(modcmd_t cmd, void *data)
3814 {
3815 int error;
3816
3817 error = 0;
3818 switch (cmd) {
3819 case MODULE_CMD_INIT:
3820 error = raid_modcmd_init();
3821 break;
3822 case MODULE_CMD_FINI:
3823 error = raid_modcmd_fini();
3824 break;
3825 default:
3826 error = ENOTTY;
3827 break;
3828 }
3829 return error;
3830 }
3831
3832 static int
3833 raid_modcmd_init(void)
3834 {
3835 int error;
3836 int bmajor, cmajor;
3837
3838 mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
3839 mutex_enter(&raid_lock);
3840 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3841 rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
3842 rf_init_cond2(rf_sparet_wait_cv, "sparetw");
3843 rf_init_cond2(rf_sparet_resp_cv, "rfgst");
3844
3845 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
3846 #endif
3847
3848 bmajor = cmajor = -1;
3849 error = devsw_attach("raid", &raid_bdevsw, &bmajor,
3850 &raid_cdevsw, &cmajor);
3851 if (error != 0 && error != EEXIST) {
3852 aprint_error("%s: devsw_attach failed %d\n", __func__, error);
3853 mutex_exit(&raid_lock);
3854 return error;
3855 }
3856 #ifdef _MODULE
3857 error = config_cfdriver_attach(&raid_cd);
3858 if (error != 0) {
3859 aprint_error("%s: config_cfdriver_attach failed %d\n",
3860 __func__, error);
3861 devsw_detach(&raid_bdevsw, &raid_cdevsw);
3862 mutex_exit(&raid_lock);
3863 return error;
3864 }
3865 #endif
3866 error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3867 if (error != 0) {
3868 aprint_error("%s: config_cfattach_attach failed %d\n",
3869 __func__, error);
3870 #ifdef _MODULE
3871 config_cfdriver_detach(&raid_cd);
3872 #endif
3873 devsw_detach(&raid_bdevsw, &raid_cdevsw);
3874 mutex_exit(&raid_lock);
3875 return error;
3876 }
3877
3878 raidautoconfigdone = false;
3879
3880 mutex_exit(&raid_lock);
3881
3882 if (error == 0) {
3883 if (rf_BootRaidframe(true) == 0)
3884 aprint_verbose("Kernelized RAIDframe activated\n");
3885 else
3886 panic("Serious error activating RAID!!");
3887 }
3888
3889 /*
3890 * Register a finalizer which will be used to auto-config RAID
3891 * sets once all real hardware devices have been found.
3892 */
3893 error = config_finalize_register(NULL, rf_autoconfig);
3894 if (error != 0) {
3895 aprint_error("WARNING: unable to register RAIDframe "
3896 "finalizer\n");
3897 error = 0;
3898 }
3899
3900 return error;
3901 }
3902
3903 static int
3904 raid_modcmd_fini(void)
3905 {
3906 int error;
3907
3908 mutex_enter(&raid_lock);
3909
3910 /* Don't allow unload if raid device(s) exist. */
3911 if (!LIST_EMPTY(&raids)) {
3912 mutex_exit(&raid_lock);
3913 return EBUSY;
3914 }
3915
3916 error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
3917 if (error != 0) {
3918 aprint_error("%s: cannot detach cfattach\n",__func__);
3919 mutex_exit(&raid_lock);
3920 return error;
3921 }
3922 #ifdef _MODULE
3923 error = config_cfdriver_detach(&raid_cd);
3924 if (error != 0) {
3925 aprint_error("%s: cannot detach cfdriver\n",__func__);
3926 config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3927 mutex_exit(&raid_lock);
3928 return error;
3929 }
3930 #endif
3931 error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
3932 if (error != 0) {
3933 aprint_error("%s: cannot detach devsw\n",__func__);
3934 #ifdef _MODULE
3935 config_cfdriver_attach(&raid_cd);
3936 #endif
3937 config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3938 mutex_exit(&raid_lock);
3939 return error;
3940 }
3941 rf_BootRaidframe(false);
3942 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3943 rf_destroy_mutex2(rf_sparet_wait_mutex);
3944 rf_destroy_cond2(rf_sparet_wait_cv);
3945 rf_destroy_cond2(rf_sparet_resp_cv);
3946 #endif
3947 mutex_exit(&raid_lock);
3948 mutex_destroy(&raid_lock);
3949
3950 return error;
3951 }
3952