/* $NetBSD: rf_netbsdkintf.c,v 1.368 2019/02/06 02:49:09 oster Exp $ */

/*-
 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Greg Oster; Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *      The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *
 *      @(#)cd.c        8.2 (Berkeley) 11/16/93
 */

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Mark Holland, Jim Zelenka
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
 * School of Computer Science
 * Carnegie Mellon University
 * Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/***********************************************************
 *
 * rf_kintf.c -- the kernel interface routines for RAIDframe
 *
 ***********************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.368 2019/02/06 02:49:09 oster Exp $");

#ifdef _KERNEL_OPT
#include "opt_raid_autoconfig.h"
#include "opt_compat_netbsd32.h"
#endif

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/disk.h>
#include <sys/device.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/reboot.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/compat_stub.h>

#include <prop/proplib.h>

#include <dev/raidframe/raidframevar.h>
#include <dev/raidframe/raidframeio.h>
#include <dev/raidframe/rf_paritymap.h>

#include "rf_raid.h"
#include "rf_copyback.h"
#include "rf_dag.h"
#include "rf_dagflags.h"
#include "rf_desc.h"
#include "rf_diskqueue.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_kintf.h"
#include "rf_options.h"
#include "rf_driver.h"
#include "rf_parityscan.h"
#include "rf_threadstuff.h"

#include "ioconf.h"

#ifdef DEBUG
int rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else /* DEBUG */
#define db1_printf(a) { }
#endif /* DEBUG */

#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

static RF_SparetWait_t *rf_sparet_wait_queue;   /* requests to install a
                                                 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;   /* responses from
                                                 * installation process */
#endif

MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");

/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int, struct proc *);
static void raidinit(struct raid_softc *);
static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);

static int raid_match(device_t, cfdata_t, void *);
static void raid_attach(device_t, device_t, void *);
static int raid_detach(device_t, int);

static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

static int raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);

static int raid_diskstart(device_t, struct buf *bp);
static int raid_dumpblocks(device_t, void *, daddr_t, int);
static int raid_lastclose(device_t);

static dev_type_open(raidopen);
static dev_type_close(raidclose);
static dev_type_read(raidread);
static dev_type_write(raidwrite);
static dev_type_ioctl(raidioctl);
static dev_type_strategy(raidstrategy);
static dev_type_dump(raiddump);
static dev_type_size(raidsize);

const struct bdevsw raid_bdevsw = {
        .d_open = raidopen,
        .d_close = raidclose,
        .d_strategy = raidstrategy,
        .d_ioctl = raidioctl,
        .d_dump = raiddump,
        .d_psize = raidsize,
        .d_discard = nodiscard,
        .d_flag = D_DISK
};

const struct cdevsw raid_cdevsw = {
        .d_open = raidopen,
        .d_close = raidclose,
        .d_read = raidread,
        .d_write = raidwrite,
        .d_ioctl = raidioctl,
        .d_stop = nostop,
        .d_tty = notty,
        .d_poll = nopoll,
        .d_mmap = nommap,
        .d_kqfilter = nokqfilter,
        .d_discard = nodiscard,
        .d_flag = D_DISK
};

static struct dkdriver rf_dkdriver = {
        .d_open = raidopen,
        .d_close = raidclose,
        .d_strategy = raidstrategy,
        .d_diskstart = raid_diskstart,
        .d_dumpblocks = raid_dumpblocks,
        .d_lastclose = raid_lastclose,
        .d_minphys = minphys
};

#define raidunit(x)     DISKUNIT(x)
#define raidsoftc(dev)  (((struct raid_softc *)device_private(dev))->sc_r.softc)

extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
        RF_RowCol_t col;
        RF_ReconReqFlags_t flags;
        void *raidPtr;
};

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING 6
#endif

#define RAIDLABELDEV(dev)       \
        (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

void rf_ReconThread(struct rf_recon_req_internal *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
int rf_autoconfig(device_t);
void rf_buildroothack(RF_ConfigSet_t *);

RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *, RF_AutoConfig_t *);
int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
void rf_create_configuration(RF_AutoConfig_t *, RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
static bool raidautoconfigdone = false;

struct RF_Pools_s rf_pools;

static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;

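/* Allocate and initialize the software state for a new RAID unit. */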
static struct raid_softc *
raidcreate(int unit) {
        struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
        sc->sc_unit = unit;
        cv_init(&sc->sc_cv, "raidunit");
        mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
        return sc;
}

static void
raiddestroy(struct raid_softc *sc) {
        cv_destroy(&sc->sc_cv);
        mutex_destroy(&sc->sc_mutex);
        kmem_free(sc, sizeof(*sc));
}

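/*
 * Look up the softc for the given unit in the global list, optionally
 * creating and enlisting a new one if it does not exist yet.
 */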
static struct raid_softc *
raidget(int unit, bool create) {
        struct raid_softc *sc;
        if (unit < 0) {
#ifdef DIAGNOSTIC
                panic("%s: unit %d!", __func__, unit);
#endif
                return NULL;
        }
        mutex_enter(&raid_lock);
        LIST_FOREACH(sc, &raids, sc_link) {
                if (sc->sc_unit == unit) {
                        mutex_exit(&raid_lock);
                        return sc;
                }
        }
        mutex_exit(&raid_lock);
        if (!create)
                return NULL;
        if ((sc = raidcreate(unit)) == NULL)
                return NULL;
        mutex_enter(&raid_lock);
        LIST_INSERT_HEAD(&raids, sc, sc_link);
        mutex_exit(&raid_lock);
        return sc;
}

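/* Remove a softc from the global list and destroy it. */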
static void
raidput(struct raid_softc *sc) {
        mutex_enter(&raid_lock);
        LIST_REMOVE(sc, sc_link);
        mutex_exit(&raid_lock);
        raiddestroy(sc);
}

void
raidattach(int num)
{

        /*
         * Device attachment and associated initialization now occurs
         * as part of the module initialization.
         */
}

int
rf_autoconfig(device_t self)
{
        RF_AutoConfig_t *ac_list;
        RF_ConfigSet_t *config_sets;

        if (!raidautoconfig || raidautoconfigdone == true)
                return (0);

        /* XXX This code can only be run once. */
        raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
        /*
         * 0. find the boot device if needed first so we can use it later
         * this needs to be done before we autoconfigure any raid sets,
         * because if we use wedges we are not going to be able to open
         * the boot device later
         */
        if (booted_device == NULL)
                cpu_bootconf();
#endif
        /* 1. locate all RAID components on the system */
        aprint_debug("Searching for RAID components...\n");
        ac_list = rf_find_raid_components();

        /* 2. Sort them into their respective sets. */
        config_sets = rf_create_auto_sets(ac_list);

        /*
         * 3. Evaluate each set and configure the valid ones.
         * This gets done in rf_buildroothack().
         */
        rf_buildroothack(config_sets);

        return 1;
}

int
rf_inited(const struct raid_softc *rs) {
        return (rs->sc_flags & RAIDF_INITED) != 0;
}

RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
        return &rs->sc_r;
}

int
rf_get_unit(const struct raid_softc *rs) {
        return rs->sc_unit;
}

static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
        const char *bootname;
        size_t len;

        /* if bdv is NULL, the set can't contain it. exit early. */
        if (bdv == NULL)
                return 0;

        bootname = device_xname(bdv);
        len = strlen(bootname);

        for (int col = 0; col < r->numCol; col++) {
                const char *devname = r->Disks[col].devname;
                devname += sizeof("/dev/") - 1;
                if (strncmp(devname, "dk", 2) == 0) {
                        const char *parent =
                            dkwedge_get_parent_name(r->Disks[col].dev);
                        if (parent != NULL)
                                devname = parent;
                }
                if (strncmp(devname, bootname, len) == 0) {
                        struct raid_softc *sc = r->softc;
                        aprint_debug("raid%d includes boot device %s\n",
                            sc->sc_unit, devname);
                        return 1;
                }
        }
        return 0;
}

void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
        RF_ConfigSet_t *cset;
        RF_ConfigSet_t *next_cset;
        int num_root;
        struct raid_softc *sc, *rsc;
        struct dk_softc *dksc;

        sc = rsc = NULL;
        num_root = 0;
        cset = config_sets;
        while (cset != NULL) {
                next_cset = cset->next;
                if (rf_have_enough_components(cset) &&
                    cset->ac->clabel->autoconfigure == 1) {
                        sc = rf_auto_config_set(cset);
                        if (sc != NULL) {
                                aprint_debug("raid%d: configured ok, rootable %d\n",
                                    sc->sc_unit, cset->rootable);
                                if (cset->rootable) {
                                        rsc = sc;
                                        num_root++;
                                }
                        } else {
                                /* The autoconfig didn't work :( */
                                aprint_debug("Autoconfig failed\n");
                                rf_release_all_vps(cset);
                        }
                } else {
                        /* we're not autoconfiguring this set...
                           release the associated resources */
                        rf_release_all_vps(cset);
                }
                /* cleanup */
                rf_cleanup_config_set(cset);
                cset = next_cset;
        }
        dksc = &rsc->sc_dksc;

        /* if the user has specified what the root device should be
           then we don't touch booted_device or boothowto... */

        if (rootspec != NULL) {
                DPRINTF("%s: rootspec %s\n", __func__, rootspec);
                return;
        }

        /* we found something bootable... */

        /*
         * XXX: The following code assumes that the root raid
         * is the first ('a') partition. This is about the best
         * we can do with a BSD disklabel, but we might be able
         * to do better with a GPT label, by setting a specified
         * attribute to indicate the root partition. We can then
         * stash the partition number in the r->root_partition
         * high bits (the bottom 2 bits are already used). For
         * now we just set booted_partition to 0 when we override
         * root.
         */
        if (num_root == 1) {
                device_t candidate_root;
                if (dksc->sc_dkdev.dk_nwedges != 0) {
                        char cname[sizeof(cset->ac->devname)];
                        /* XXX: assume partition 'a' first */
                        snprintf(cname, sizeof(cname), "%s%c",
                            device_xname(dksc->sc_dev), 'a');
                        candidate_root = dkwedge_find_by_wname(cname);
                        DPRINTF("%s: candidate wedge root=%s\n", __func__,
                            cname);
                        if (candidate_root == NULL) {
                                /*
                                 * If that is not found, because we don't use
                                 * disklabel, return the first dk child
                                 * XXX: we can skip the 'a' check above
                                 * and always do this...
                                 */
                                size_t i = 0;
                                candidate_root = dkwedge_find_by_parent(
                                    device_xname(dksc->sc_dev), &i);
                        }
                        DPRINTF("%s: candidate wedge root=%p\n", __func__,
                            candidate_root);
                } else
                        candidate_root = dksc->sc_dev;
                DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
                DPRINTF("%s: booted_device=%p root_partition=%d "
                    "contains_boot=%d",
                    __func__, booted_device, rsc->sc_r.root_partition,
                    rf_containsboot(&rsc->sc_r, booted_device));
                /* XXX the check for booted_device == NULL can probably be
                 * dropped, now that rf_containsboot handles that case.
                 */
                if (booted_device == NULL ||
                    rsc->sc_r.root_partition == 1 ||
                    rf_containsboot(&rsc->sc_r, booted_device)) {
                        booted_device = candidate_root;
                        booted_method = "raidframe/single";
                        booted_partition = 0;   /* XXX assume 'a' */
                }
        } else if (num_root > 1) {
                DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
                    booted_device);

                /*
                 * Maybe the MD code can help. If it cannot, then
                 * setroot() will discover that we have no
                 * booted_device and will ask the user if nothing was
                 * hardwired in the kernel config file
                 */
                if (booted_device == NULL)
                        return;

                num_root = 0;
                mutex_enter(&raid_lock);
                LIST_FOREACH(sc, &raids, sc_link) {
                        RF_Raid_t *r = &sc->sc_r;
                        if (r->valid == 0)
                                continue;

                        if (r->root_partition == 0)
                                continue;

                        if (rf_containsboot(r, booted_device)) {
                                num_root++;
                                rsc = sc;
                                dksc = &rsc->sc_dksc;
                        }
                }
                mutex_exit(&raid_lock);

                if (num_root == 1) {
                        booted_device = dksc->sc_dev;
                        booted_method = "raidframe/multi";
                        booted_partition = 0;   /* XXX assume 'a' */
                } else {
                        /* we can't guess.. require the user to answer... */
                        boothowto |= RB_ASKNAME;
                }
        }
}

static int
raidsize(dev_t dev)
{
        struct raid_softc *rs;
        struct dk_softc *dksc;
        unsigned int unit;

        unit = raidunit(dev);
        if ((rs = raidget(unit, false)) == NULL)
                return -1;
        dksc = &rs->sc_dksc;

        if ((rs->sc_flags & RAIDF_INITED) == 0)
                return -1;

        return dk_size(dksc, dev);
}

static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
        unsigned int unit;
        struct raid_softc *rs;
        struct dk_softc *dksc;

        unit = raidunit(dev);
        if ((rs = raidget(unit, false)) == NULL)
                return ENXIO;
        dksc = &rs->sc_dksc;

        if ((rs->sc_flags & RAIDF_INITED) == 0)
                return ENODEV;

        /*
           Note that blkno is relative to this particular partition.
           By adding RF_PROTECTED_SECTORS, we get a value that
           is relative to the partition used for the underlying component.
         */
        blkno += RF_PROTECTED_SECTORS;

        return dk_dump(dksc, dev, blkno, va, size);
}

static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
        struct raid_softc *rs = raidsoftc(dev);
        const struct bdevsw *bdev;
        RF_Raid_t *raidPtr;
        int c, sparecol, j, scol, dumpto;
        int error = 0;

        raidPtr = &rs->sc_r;

        /* we only support dumping to RAID 1 sets */
        if (raidPtr->Layout.numDataCol != 1 ||
            raidPtr->Layout.numParityCol != 1)
                return EINVAL;

        if ((error = raidlock(rs)) != 0)
                return error;

        /* figure out what device is alive.. */

        /*
           Look for a component to dump to. The preference for the
           component to dump to is as follows:
           1) the master
           2) a used_spare of the master
           3) the slave
           4) a used_spare of the slave
         */

        dumpto = -1;
        for (c = 0; c < raidPtr->numCol; c++) {
                if (raidPtr->Disks[c].status == rf_ds_optimal) {
                        /* this might be the one */
                        dumpto = c;
                        break;
                }
        }

        /*
           At this point we have possibly selected a live master or a
           live slave. We now check to see if there is a spared
           master (or a spared slave), if we didn't find a live master
           or a live slave.
         */

        for (c = 0; c < raidPtr->numSpare; c++) {
                sparecol = raidPtr->numCol + c;
                if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
                        /* How about this one? */
                        scol = -1;
                        for (j = 0; j < raidPtr->numCol; j++) {
                                if (raidPtr->Disks[j].spareCol == sparecol) {
                                        scol = j;
                                        break;
                                }
                        }
                        if (scol == 0) {
                                /*
                                   We must have found a spared master!
                                   We'll take that over anything else
                                   found so far. (We couldn't have
                                   found a real master before, since
                                   this is a used spare, and it's
                                   saying that it's replacing the
                                   master.) On reboot (with
                                   autoconfiguration turned on)
                                   sparecol will become the 1st
                                   component (component0) of this set.
                                 */
                                dumpto = sparecol;
                                break;
                        } else if (scol != -1) {
                                /*
                                   Must be a spared slave. We'll dump
                                   to that if we haven't found anything
                                   else so far.
                                 */
                                if (dumpto == -1)
                                        dumpto = sparecol;
                        }
                }
        }

        if (dumpto == -1) {
                /* we couldn't find any live components to dump to!?!?
                 */
                error = EINVAL;
                goto out;
        }

        bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
        if (bdev == NULL) {
                error = ENXIO;
                goto out;
        }

        error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
            blkno, va, nblk * raidPtr->bytesPerSector);

out:
        raidunlock(rs);

        return error;
}

/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
        int unit = raidunit(dev);
        struct raid_softc *rs;
        struct dk_softc *dksc;
        int error = 0;
        int part, pmask;

        if ((rs = raidget(unit, true)) == NULL)
                return ENXIO;
        if ((error = raidlock(rs)) != 0)
                return (error);

        if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
                error = EBUSY;
                goto bad;
        }

        dksc = &rs->sc_dksc;

        part = DISKPART(dev);
        pmask = (1 << part);

        if (!DK_BUSY(dksc, pmask) &&
            ((rs->sc_flags & RAIDF_INITED) != 0)) {
                /* First one... mark things as dirty... Note that we *MUST*
                   have done a configure before this. I DO NOT WANT TO BE
                   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
                   THAT THEY BELONG TOGETHER!!!!! */
                /* XXX should check to see if we're only open for reading
                   here... If so, we needn't do this, but then need some
                   other way of keeping track of what's happened.. */

                rf_markalldirty(&rs->sc_r);
        }

        if ((rs->sc_flags & RAIDF_INITED) != 0)
                error = dk_open(dksc, dev, flags, fmt, l);

bad:
        raidunlock(rs);

        return (error);


}

static int
raid_lastclose(device_t self)
{
        struct raid_softc *rs = raidsoftc(self);

        /* Last one... device is not unconfigured yet.
           Device shutdown has taken care of setting the
           clean bits if RAIDF_INITED is not set;
           mark things as clean here... */

        rf_update_component_labels(&rs->sc_r,
            RF_FINAL_COMPONENT_UPDATE);

        /* pass to unlocked code */
        if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
                rs->sc_flags |= RAIDF_DETACH;

        return 0;
}

/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
        int unit = raidunit(dev);
        struct raid_softc *rs;
        struct dk_softc *dksc;
        cfdata_t cf;
        int error = 0, do_detach = 0, do_put = 0;

        if ((rs = raidget(unit, false)) == NULL)
                return ENXIO;
        dksc = &rs->sc_dksc;

        if ((error = raidlock(rs)) != 0)
                return (error);

        if ((rs->sc_flags & RAIDF_INITED) != 0) {
                error = dk_close(dksc, dev, flags, fmt, l);
                if ((rs->sc_flags & RAIDF_DETACH) != 0)
                        do_detach = 1;
        } else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
                do_put = 1;

        raidunlock(rs);

        if (do_detach) {
                /* free the pseudo device attach bits */
                cf = device_cfdata(dksc->sc_dev);
                error = config_detach(dksc->sc_dev, 0);
                if (error == 0)
                        free(cf, M_RAIDFRAME);
        } else if (do_put) {
                raidput(rs);
        }

        return (error);

}

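/* Wake the RAIDframe engine thread so that queued work gets scheduled. */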
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
        rf_lock_mutex2(raidPtr->iodone_lock);
        rf_signal_cond2(raidPtr->iodone_cv);
        rf_unlock_mutex2(raidPtr->iodone_lock);
}

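/*
 * Block device strategy entry point: validate the unit, queue the
 * buffer via dk(4), and poke the RAIDframe engine to service it.
 */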
static void
raidstrategy(struct buf *bp)
{
        unsigned int unit;
        struct raid_softc *rs;
        struct dk_softc *dksc;
        RF_Raid_t *raidPtr;

        unit = raidunit(bp->b_dev);
        if ((rs = raidget(unit, false)) == NULL) {
                bp->b_error = ENXIO;
                goto fail;
        }
        if ((rs->sc_flags & RAIDF_INITED) == 0) {
                bp->b_error = ENXIO;
                goto fail;
        }
        dksc = &rs->sc_dksc;
        raidPtr = &rs->sc_r;

        /* Queue IO only */
        if (dk_strategy_defer(dksc, bp))
                goto done;

        /* schedule the IO to happen at the next convenient time */
        raid_wakeup(raidPtr);

done:
        return;

fail:
        bp->b_resid = bp->b_bcount;
        biodone(bp);
}

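/* dk(4) driver hook: start I/O for one buffer on the RAID set. */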
static int
raid_diskstart(device_t dev, struct buf *bp)
{
        struct raid_softc *rs = raidsoftc(dev);
        RF_Raid_t *raidPtr;

        raidPtr = &rs->sc_r;
        if (!raidPtr->valid) {
                db1_printf(("raid is not valid..\n"));
                return ENODEV;
        }

        /* XXX */
        bp->b_resid = 0;

        return raiddoaccess(raidPtr, bp);
}

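/*
 * I/O completion: hand the finished buffer back to dk(4), return the
 * opening taken in raiddoaccess(), and wake the engine for more work.
 */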
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
        struct raid_softc *rs;
        struct dk_softc *dksc;

        rs = raidPtr->softc;
        dksc = &rs->sc_dksc;

        dk_done(dksc, bp);

        rf_lock_mutex2(raidPtr->mutex);
        raidPtr->openings++;
        rf_unlock_mutex2(raidPtr->mutex);

        /* schedule more IO */
        raid_wakeup(raidPtr);
}

/* ARGSUSED */
static int
raidread(dev_t dev, struct uio *uio, int flags)
{
        int unit = raidunit(dev);
        struct raid_softc *rs;

        if ((rs = raidget(unit, false)) == NULL)
                return ENXIO;

        if ((rs->sc_flags & RAIDF_INITED) == 0)
                return (ENXIO);

        return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));

}

/* ARGSUSED */
static int
raidwrite(dev_t dev, struct uio *uio, int flags)
{
        int unit = raidunit(dev);
        struct raid_softc *rs;

        if ((rs = raidget(unit, false)) == NULL)
                return ENXIO;

        if ((rs->sc_flags & RAIDF_INITED) == 0)
                return (ENXIO);

        return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));

}

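/*
 * Unconfigure a RAID set: refuse while busy or rebuilding, shut the
 * RAIDframe engine down, drain queued buffers, and detach the disk.
 */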
static int
raid_detach_unlocked(struct raid_softc *rs)
{
        struct dk_softc *dksc = &rs->sc_dksc;
        RF_Raid_t *raidPtr;
        int error;

        raidPtr = &rs->sc_r;

        if (DK_BUSY(dksc, 0) ||
            raidPtr->recon_in_progress != 0 ||
            raidPtr->parity_rewrite_in_progress != 0 ||
            raidPtr->copyback_in_progress != 0)
                return EBUSY;

        if ((rs->sc_flags & RAIDF_INITED) == 0)
                return 0;

        rs->sc_flags &= ~RAIDF_SHUTDOWN;

        if ((error = rf_Shutdown(raidPtr)) != 0)
                return error;

        rs->sc_flags &= ~RAIDF_INITED;

        /* Kill off any queued buffers */
        dk_drain(dksc);
        bufq_free(dksc->sc_bufq);

        /* Detach the disk. */
        dkwedge_delall(&dksc->sc_dkdev);
        disk_detach(&dksc->sc_dkdev);
        disk_destroy(&dksc->sc_dkdev);
        dk_detach(dksc);

        return 0;
}

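/*
 * For ioctls that only make sense on a configured set, report whether
 * the set still needs to be initialized (i.e. the ioctl must fail).
 */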
static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
        switch (cmd) {
        case RAIDFRAME_ADD_HOT_SPARE:
        case RAIDFRAME_CHECK_COPYBACK_STATUS:
        case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
        case RAIDFRAME_CHECK_PARITY:
        case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
        case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
        case RAIDFRAME_CHECK_RECON_STATUS:
        case RAIDFRAME_CHECK_RECON_STATUS_EXT:
        case RAIDFRAME_COPYBACK:
        case RAIDFRAME_DELETE_COMPONENT:
        case RAIDFRAME_FAIL_DISK:
        case RAIDFRAME_GET_ACCTOTALS:
        case RAIDFRAME_GET_COMPONENT_LABEL:
        case RAIDFRAME_GET_INFO:
        case RAIDFRAME_GET_SIZE:
        case RAIDFRAME_INCORPORATE_HOT_SPARE:
        case RAIDFRAME_INIT_LABELS:
        case RAIDFRAME_KEEP_ACCTOTALS:
        case RAIDFRAME_PARITYMAP_GET_DISABLE:
        case RAIDFRAME_PARITYMAP_SET_DISABLE:
        case RAIDFRAME_PARITYMAP_SET_PARAMS:
        case RAIDFRAME_PARITYMAP_STATUS:
        case RAIDFRAME_REBUILD_IN_PLACE:
        case RAIDFRAME_REMOVE_HOT_SPARE:
        case RAIDFRAME_RESET_ACCTOTALS:
        case RAIDFRAME_REWRITEPARITY:
        case RAIDFRAME_SET_AUTOCONFIG:
        case RAIDFRAME_SET_COMPONENT_LABEL:
        case RAIDFRAME_SET_ROOT:
                return (rs->sc_flags & RAIDF_INITED) == 0;
        }
        return false;
}

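/*
 * Mark a component as failed and kick off a reconstruction thread,
 * refusing requests that would make the set unrecoverable.
 */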
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
        struct rf_recon_req_internal *rrint;

        if (raidPtr->Layout.map->faultsTolerated == 0) {
                /* Can't do this on a RAID 0!! */
                return EINVAL;
        }

        if (rr->col < 0 || rr->col >= raidPtr->numCol) {
                /* bad column */
                return EINVAL;
        }

        rf_lock_mutex2(raidPtr->mutex);
        if (raidPtr->status == rf_rs_reconstructing) {
                /* you can't fail a disk while we're reconstructing! */
                /* XXX wrong for RAID6 */
                goto out;
        }
        if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
            (raidPtr->numFailures > 0)) {
                /* some other component has failed. Let's not make
                   things worse. XXX wrong for RAID6 */
                goto out;
        }
        if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
                /* Can't fail a spared disk! */
                goto out;
        }
        rf_unlock_mutex2(raidPtr->mutex);

        /* make a copy of the recon request so that we don't rely on
         * the user's buffer */
        RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
        if (rrint == NULL)
                return (ENOMEM);
        rrint->col = rr->col;
        rrint->flags = rr->flags;
        rrint->raidPtr = raidPtr;

        return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
            rrint, "raid_recon");
out:
        rf_unlock_mutex2(raidPtr->mutex);
        return EINVAL;
}

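/* Copy the layout-specific configuration data in from user space. */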
static int
rf_copyinspecificbuf(RF_Config_t *k_cfg)
{
        /* allocate a buffer for the layout-specific data, and copy it in */
        if (k_cfg->layoutSpecificSize == 0)
                return 0;

        if (k_cfg->layoutSpecificSize > 10000) {
                /* sanity check */
                return EINVAL;
        }

        u_char *specific_buf;
        RF_Malloc(specific_buf, k_cfg->layoutSpecificSize, (u_char *));
        if (specific_buf == NULL)
                return ENOMEM;

        int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
            k_cfg->layoutSpecificSize);
        if (retcode) {
                RF_Free(specific_buf, k_cfg->layoutSpecificSize);
                db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
                return retcode;
        }

        k_cfg->layoutSpecific = specific_buf;
        return 0;
}

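/*
 * Copy the user's RF_Config_t into a newly allocated kernel buffer;
 * data points to a pointer to the user-space configuration structure.
 */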
static int
rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
{
        if (rs->sc_r.valid) {
                /* There is a valid RAID set running on this unit! */
                printf("raid%d: Device already configured!\n", rs->sc_unit);
                return EINVAL;
        }

        /* copy-in the configuration information */
        /* data points to a pointer to the configuration structure */
        RF_Malloc(*k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
        if (*k_cfg == NULL) {
                return ENOMEM;
        }
        int retcode = copyin(*((RF_Config_t **)data), *k_cfg,
            sizeof(RF_Config_t));
        if (retcode == 0)
                return 0;
        RF_Free(*k_cfg, sizeof(RF_Config_t));
        db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
        rs->sc_flags |= RAIDF_SHUTDOWN;
        return retcode;
}

int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
        int retcode;
        RF_Raid_t *raidPtr = &rs->sc_r;

        rs->sc_flags &= ~RAIDF_SHUTDOWN;

        if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
                goto out;

        /* should do some kind of sanity check on the configuration.
         * Store the sum of all the bytes in the last byte? */

        /* configure the system */

        /*
         * Clear the entire RAID descriptor, just to make sure
         * there is no stale data left in the case of a
         * reconfiguration
         */
        memset(raidPtr, 0, sizeof(*raidPtr));
        raidPtr->softc = rs;
        raidPtr->raidid = rs->sc_unit;

        retcode = rf_Configure(raidPtr, k_cfg, NULL);

        if (retcode == 0) {
                /* allow this many simultaneous IO's to
                   this RAID device */
                raidPtr->openings = RAIDOUTSTANDING;

                raidinit(rs);
                raid_wakeup(raidPtr);
                rf_markalldirty(raidPtr);
        }

        /* free the buffers. No return code here. */
        if (k_cfg->layoutSpecificSize) {
                RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
        }
out:
        RF_Free(k_cfg, sizeof(RF_Config_t));
        if (retcode) {
                /*
                 * If configuration failed, set sc_flags so that we
                 * will detach the device when we close it.
                 */
                rs->sc_flags |= RAIDF_SHUTDOWN;
        }
        return retcode;
}

#if RF_DISABLED
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

        /* XXX check the label for valid stuff... */
        /* Note that some things *should not* get modified --
           the user should be re-initing the labels instead of
           trying to patch things.
         */
#ifdef DEBUG
        int raidid = raidPtr->raidid;
        printf("raid%d: Got component label:\n", raidid);
        printf("raid%d: Version: %d\n", raidid, clabel->version);
        printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
        printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
        printf("raid%d: Column: %d\n", raidid, clabel->column);
        printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
        printf("raid%d: Clean: %d\n", raidid, clabel->clean);
        printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif /* DEBUG */
        clabel->row = 0;
        int column = clabel->column;

        if ((column < 0) || (column >= raidPtr->numCol)) {
                return (EINVAL);
        }

        /* XXX this isn't allowed to do anything for now :-) */

        /* XXX and before it is, we need to fill in the rest
           of the fields!?!?!?! */
        memcpy(raidget_component_label(raidPtr, column),
            clabel, sizeof(*clabel));
        raidflush_component_label(raidPtr, column);
        return 0;
}
#endif

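/*
 * Write fresh component labels to all live components, keeping only
 * the serial number from the label supplied by the user.
 */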
static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
        /*
           we only want the serial number from
           the above. We get all the rest of the information
           from the config that was used to create this RAID
           set.
         */

        raidPtr->serial_number = clabel->serial_number;

        for (int column = 0; column < raidPtr->numCol; column++) {
                RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
                if (RF_DEAD_DISK(diskPtr->status))
                        continue;
                RF_ComponentLabel_t *ci_label = raidget_component_label(
                    raidPtr, column);
                /* Zeroing this is important. */
                memset(ci_label, 0, sizeof(*ci_label));
                raid_init_component_label(raidPtr, ci_label);
                ci_label->serial_number = raidPtr->serial_number;
                ci_label->row = 0; /* we don't pretend to support more */
                rf_component_label_set_partitionsize(ci_label,
                    diskPtr->partitionSize);
                ci_label->column = column;
                raidflush_component_label(raidPtr, column);
                /* XXXjld what about the spares? */
        }

        return 0;
}

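/*
 * Rebuild a failed component in place, i.e. onto the same device it
 * occupied before, using a separate reconstruction thread.
 */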
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

        if (raidPtr->Layout.map->faultsTolerated == 0) {
                /* Can't do this on a RAID 0!! */
                return EINVAL;
        }

        if (raidPtr->recon_in_progress == 1) {
                /* a reconstruct is already in progress! */
                return EINVAL;
        }

        RF_SingleComponent_t component;
        memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
        component.row = 0; /* we don't support any more */
        int column = component.column;

        if ((column < 0) || (column >= raidPtr->numCol)) {
                return EINVAL;
        }

        rf_lock_mutex2(raidPtr->mutex);
        if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
            (raidPtr->numFailures > 0)) {
                /* XXX 0 above shouldn't be constant!!! */
                /* some component other than this has failed.
                   Let's not make things worse than they already
                   are... */
                printf("raid%d: Unable to reconstruct to disk at:\n",
                    raidPtr->raidid);
                printf("raid%d: Col: %d Too many failures.\n",
                    raidPtr->raidid, column);
                rf_unlock_mutex2(raidPtr->mutex);
                return EINVAL;
        }

        if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
                printf("raid%d: Unable to reconstruct to disk at:\n",
                    raidPtr->raidid);
                printf("raid%d: Col: %d "
                    "Reconstruction already occurring!\n",
                    raidPtr->raidid, column);

                rf_unlock_mutex2(raidPtr->mutex);
                return EINVAL;
        }

        if (raidPtr->Disks[column].status == rf_ds_spared) {
                rf_unlock_mutex2(raidPtr->mutex);
                return EINVAL;
        }

        rf_unlock_mutex2(raidPtr->mutex);

        struct rf_recon_req_internal *rrint;
        RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
        if (rrint == NULL)
                return ENOMEM;

        rrint->col = column;
        rrint->raidPtr = raidPtr;

        return RF_CREATE_THREAD(raidPtr->recon_thread,
            rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}

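/* Report reconstruction progress to the user as a percentage. */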
static int
rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
{
        /*
         * This makes no sense on a RAID 0, or if we are not reconstructing
         * so tell the user it's done.
         */
        if (raidPtr->Layout.map->faultsTolerated == 0 ||
            raidPtr->status != rf_rs_reconstructing) {
                *data = 100;
                return 0;
        }
        if (raidPtr->reconControl->numRUsTotal == 0) {
                *data = 0;
                return 0;
        }
        *data = (raidPtr->reconControl->numRUsComplete * 100
            / raidPtr->reconControl->numRUsTotal);
        return 0;
}

static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
        int unit = raidunit(dev);
        int part, pmask;
        struct raid_softc *rs;
        struct dk_softc *dksc;
        RF_Config_t *k_cfg;
        RF_Raid_t *raidPtr;
        RF_AccTotals_t *totals;
        RF_SingleComponent_t component;
        RF_DeviceConfig_t *d_cfg, **ucfgp = data;
        int retcode = 0;
        int column;
        RF_ComponentLabel_t *clabel;
        RF_SingleComponent_t *sparePtr, *componentPtr;
        int d;

        if ((rs = raidget(unit, false)) == NULL)
                return ENXIO;

        dksc = &rs->sc_dksc;
        raidPtr = &rs->sc_r;

        db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
            (int) DISKPART(dev), (int) unit, cmd));

        /* Must be initialized for these... */
        if (rf_must_be_initialized(rs, cmd))
                return ENXIO;

        switch (cmd) {
                /* configure the system */
        case RAIDFRAME_CONFIGURE:
                if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
                        return retcode;
                return rf_construct(rs, k_cfg);

                /* shutdown the system */
        case RAIDFRAME_SHUTDOWN:

                part = DISKPART(dev);
                pmask = (1 << part);

                if ((retcode = raidlock(rs)) != 0)
                        return retcode;

                if (DK_BUSY(dksc, pmask) ||
                    raidPtr->recon_in_progress != 0 ||
                    raidPtr->parity_rewrite_in_progress != 0 ||
                    raidPtr->copyback_in_progress != 0)
                        retcode = EBUSY;
                else {
                        /* detach and free on close */
                        rs->sc_flags |= RAIDF_SHUTDOWN;
                        retcode = 0;
                }

                raidunlock(rs);

                return retcode;
        case RAIDFRAME_GET_COMPONENT_LABEL:
                return rf_get_component_label(raidPtr, data);

#if RF_DISABLED
        case RAIDFRAME_SET_COMPONENT_LABEL:
                return rf_set_component_label(raidPtr, data);
#endif

        case RAIDFRAME_INIT_LABELS:
                return rf_init_component_label(raidPtr, data);

        case RAIDFRAME_SET_AUTOCONFIG:
                d = rf_set_autoconfig(raidPtr, *(int *) data);
                printf("raid%d: New autoconfig value is: %d\n",
                    raidPtr->raidid, d);
                *(int *) data = d;
                return retcode;

        case RAIDFRAME_SET_ROOT:
                d = rf_set_rootpartition(raidPtr, *(int *) data);
                printf("raid%d: New rootpartition value is: %d\n",
                    raidPtr->raidid, d);
                *(int *) data = d;
                return retcode;

                /* initialize all parity */
        case RAIDFRAME_REWRITEPARITY:

                if (raidPtr->Layout.map->faultsTolerated == 0) {
                        /* Parity for RAID 0 is trivially correct */
                        raidPtr->parity_good = RF_RAID_CLEAN;
                        return 0;
                }

                if (raidPtr->parity_rewrite_in_progress == 1) {
                        /* Re-write is already in progress! */
                        return EINVAL;
                }

                return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
                    rf_RewriteParityThread, raidPtr, "raid_parity");

        case RAIDFRAME_ADD_HOT_SPARE:
                sparePtr = (RF_SingleComponent_t *) data;
                memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
                return rf_add_hot_spare(raidPtr, &component);

        case RAIDFRAME_REMOVE_HOT_SPARE:
                return retcode;

        case RAIDFRAME_DELETE_COMPONENT:
                componentPtr = (RF_SingleComponent_t *)data;
                memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
                return rf_delete_component(raidPtr, &component);

        case RAIDFRAME_INCORPORATE_HOT_SPARE:
                componentPtr = (RF_SingleComponent_t *)data;
                memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
                return rf_incorporate_hot_spare(raidPtr, &component);

        case RAIDFRAME_REBUILD_IN_PLACE:
                return rf_rebuild_in_place(raidPtr, data);

        case RAIDFRAME_GET_INFO:
                RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
                    (RF_DeviceConfig_t *));
                if (d_cfg == NULL)
                        return ENOMEM;
                retcode = rf_get_info(raidPtr, d_cfg);
                if (retcode == 0) {
                        retcode = copyout(d_cfg, *ucfgp, sizeof(*d_cfg));
                }
                RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
                return retcode;

        case RAIDFRAME_CHECK_PARITY:
                *(int *) data = raidPtr->parity_good;
                return 0;

        case RAIDFRAME_PARITYMAP_STATUS:
                if (rf_paritymap_ineligible(raidPtr))
                        return EINVAL;
                rf_paritymap_status(raidPtr->parity_map, data);
                return 0;

        case RAIDFRAME_PARITYMAP_SET_PARAMS:
                if (rf_paritymap_ineligible(raidPtr))
                        return EINVAL;
                if (raidPtr->parity_map == NULL)
                        return ENOENT; /* ??? */
                if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
                        return EINVAL;
                return 0;

        case RAIDFRAME_PARITYMAP_GET_DISABLE:
                if (rf_paritymap_ineligible(raidPtr))
                        return EINVAL;
                *(int *) data = rf_paritymap_get_disable(raidPtr);
                return 0;

        case RAIDFRAME_PARITYMAP_SET_DISABLE:
                if (rf_paritymap_ineligible(raidPtr))
                        return EINVAL;
                rf_paritymap_set_disable(raidPtr, *(int *)data);
                /* XXX should errors be passed up? */
                return 0;

        case RAIDFRAME_RESET_ACCTOTALS:
                memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
                return 0;

        case RAIDFRAME_GET_ACCTOTALS:
                totals = (RF_AccTotals_t *) data;
                *totals = raidPtr->acc_totals;
                return 0;

        case RAIDFRAME_KEEP_ACCTOTALS:
                raidPtr->keep_acc_totals = *(int *)data;
                return 0;

        case RAIDFRAME_GET_SIZE:
                *(int *) data = raidPtr->totalSectors;
                return 0;

        case RAIDFRAME_FAIL_DISK:
                return rf_fail_disk(raidPtr, data);

                /* invoke a copyback operation after recon on whatever disk
                 * needs it, if any */
        case RAIDFRAME_COPYBACK:

                if (raidPtr->Layout.map->faultsTolerated == 0) {
                        /* This makes no sense on a RAID 0!! */
                        return EINVAL;
                }

                if (raidPtr->copyback_in_progress == 1) {
                        /* Copyback is already in progress! */
                        return EINVAL;
                }

                return RF_CREATE_THREAD(raidPtr->copyback_thread,
                    rf_CopybackThread, raidPtr, "raid_copyback");

                /* return the percentage completion of reconstruction */
        case RAIDFRAME_CHECK_RECON_STATUS:
                return rf_check_recon_status(raidPtr, data);

        case RAIDFRAME_CHECK_RECON_STATUS_EXT:
                rf_check_recon_status_ext(raidPtr, data);
                return 0;

        case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
                if (raidPtr->Layout.map->faultsTolerated == 0) {
                        /* This makes no sense on a RAID 0, so tell the
                           user it's done. */
                        *(int *) data = 100;
                        return 0;
                }
                if (raidPtr->parity_rewrite_in_progress == 1) {
                        *(int *) data = 100 *
                            raidPtr->parity_rewrite_stripes_done /
                            raidPtr->Layout.numStripe;
                } else {
                        *(int *) data = 100;
                }
                return 0;

        case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
                rf_check_parityrewrite_status_ext(raidPtr, data);
                return 0;

        case RAIDFRAME_CHECK_COPYBACK_STATUS:
                if (raidPtr->Layout.map->faultsTolerated == 0) {
                        /* This makes no sense on a RAID 0 */
                        *(int *) data = 100;
                        return 0;
                }
                if (raidPtr->copyback_in_progress == 1) {
                        *(int *) data = 100 * raidPtr->copyback_stripes_done /
                            raidPtr->Layout.numStripe;
                } else {
                        *(int *) data = 100;
                }
                return 0;

        case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
                rf_check_copyback_status_ext(raidPtr, data);
                return 0;

        case RAIDFRAME_SET_LAST_UNIT:
                for (column = 0; column < raidPtr->numCol; column++)
                        if (raidPtr->Disks[column].status != rf_ds_optimal)
                                return EBUSY;

                for (column = 0; column < raidPtr->numCol; column++) {
                        clabel = raidget_component_label(raidPtr, column);
                        clabel->last_unit = *(int *)data;
                        raidflush_component_label(raidPtr, column);
                }
                rs->sc_cflags |= RAIDF_UNIT_CHANGED;
                return 0;

                /* the sparetable daemon calls this to wait for the kernel to
                 * need a spare table. this ioctl does not return until a
                 * spare table is needed. XXX -- calling mpsleep here in the
                 * ioctl code is almost certainly wrong and evil. -- XXX XXX
                 * -- I should either compute the spare table in the kernel,
                 * or have a different -- XXX XXX -- interface (a different
                 * character device) for delivering the table -- XXX */
#if RF_DISABLED
        case RAIDFRAME_SPARET_WAIT:
                rf_lock_mutex2(rf_sparet_wait_mutex);
                while (!rf_sparet_wait_queue)
                        rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
                RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
                rf_sparet_wait_queue = rf_sparet_wait_queue->next;
                rf_unlock_mutex2(rf_sparet_wait_mutex);

                /* structure assignment */
                *((RF_SparetWait_t *) data) = *waitreq;

                RF_Free(waitreq, sizeof(*waitreq));
                return 0;

                /* wakes up a process waiting on SPARET_WAIT and puts an error
                 * code in it that will cause the daemon to exit */
        case RAIDFRAME_ABORT_SPARET_WAIT:
                RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
                waitreq->fcol = -1;
                rf_lock_mutex2(rf_sparet_wait_mutex);
                waitreq->next = rf_sparet_wait_queue;
                rf_sparet_wait_queue = waitreq;
                rf_broadcast_cond2(rf_sparet_wait_cv);
                rf_unlock_mutex2(rf_sparet_wait_mutex);
                return 0;

                /* used by the spare table daemon to deliver a spare table
                 * into the kernel */
        case RAIDFRAME_SEND_SPARET:

                /* install the spare table */
                retcode = rf_SetSpareTable(raidPtr, *(void **) data);

                /* respond to the requestor. the return status of the spare
                 * table installation is passed in the "fcol" field */
                RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
                waitreq->fcol = retcode;
                rf_lock_mutex2(rf_sparet_wait_mutex);
                waitreq->next = rf_sparet_resp_queue;
                rf_sparet_resp_queue = waitreq;
                rf_broadcast_cond2(rf_sparet_resp_cv);
                rf_unlock_mutex2(rf_sparet_wait_mutex);

                return retcode;
#endif
        default:
#ifdef _LP64
                if ((l->l_proc->p_flag & PK_32) != 0) {
                        module_autoload("compat_netbsd32_raid",
                            MODULE_CLASS_EXEC);
                        MODULE_CALL_HOOK(raidframe_netbsd32_ioctl_hook,
                            (rs, cmd, data), enosys(), retcode);
                        if (retcode != EPASSTHROUGH)
                                return retcode;
                }
#endif
                module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
                MODULE_CALL_HOOK(raidframe_ioctl_80_hook,
                    (rs, cmd, data), enosys(), retcode);
                if (retcode != EPASSTHROUGH)
                        return retcode;

                module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
                MODULE_CALL_HOOK(raidframe_ioctl_50_hook,
                    (rs, cmd, data), enosys(), retcode);
                if (retcode != EPASSTHROUGH)
                        return retcode;
                break; /* fall through to the os-specific code below */

        }

        if (!raidPtr->valid)
                return (EINVAL);

        /*
         * Add support for "regular" device ioctls here.
         */

        switch (cmd) {
        case DIOCGCACHE:
                retcode = rf_get_component_caches(raidPtr, (int *)data);
                break;

        case DIOCCACHESYNC:
                retcode = rf_sync_component_caches(raidPtr);
                break;

        default:
                retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
                break;
        }

        return (retcode);

}


/* raidinit -- complete the rest of the initialization for the
   RAIDframe device. */


static void
raidinit(struct raid_softc *rs)
{
        cfdata_t cf;
        unsigned int unit;
        struct dk_softc *dksc = &rs->sc_dksc;
        RF_Raid_t *raidPtr = &rs->sc_r;
        device_t dev;

        unit = raidPtr->raidid;

        /* XXX doesn't check bounds. */
        snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

        /* attach the pseudo device */
        cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
        cf->cf_name = raid_cd.cd_name;
        cf->cf_atname = raid_cd.cd_name;
        cf->cf_unit = unit;
        cf->cf_fstate = FSTATE_STAR;

        dev = config_attach_pseudo(cf);
        if (dev == NULL) {
                printf("raid%d: config_attach_pseudo failed\n",
                    raidPtr->raidid);
                free(cf, M_RAIDFRAME);
                return;
        }

        /* provide a backpointer to the real softc */
        raidsoftc(dev) = rs;

        /* disk_attach actually creates space for the CPU disklabel, among
         * other things, so it's critical to call this *BEFORE* we try putzing
         * with disklabels. */
        dk_init(dksc, dev, DKTYPE_RAID);
        disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

        /* XXX There may be a weird interaction here between this, and
         * protectedSectors, as used in RAIDframe. */

        rs->sc_size = raidPtr->totalSectors;

        /* Attach dk and disk subsystems */
        dk_attach(dksc);
        disk_attach(&dksc->sc_dkdev);
        rf_set_geometry(rs, raidPtr);

        bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

        /* mark unit as usable */
        rs->sc_flags |= RAIDF_INITED;

        dkwedge_discover(&dksc->sc_dkdev);
}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
        int retcode;

        rf_lock_mutex2(rf_sparet_wait_mutex);
        req->next = rf_sparet_wait_queue;
        rf_sparet_wait_queue = req;
        rf_broadcast_cond2(rf_sparet_wait_cv);

        /* mpsleep unlocks the mutex */
        while (!rf_sparet_resp_queue) {
                rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
        }
        req = rf_sparet_resp_queue;
        rf_sparet_resp_queue = req->next;
        rf_unlock_mutex2(rf_sparet_wait_mutex);

        retcode = req->fcol;
        RF_Free(req, sizeof(*req));     /* this is not the same req as we
                                         * alloc'd */
        return (retcode);
}
#endif

/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
        struct raid_softc *rs;
        struct dk_softc *dksc;

        rs = raidPtr->softc;
        dksc = &rs->sc_dksc;
        /* quick check to see if anything has died recently */
        rf_lock_mutex2(raidPtr->mutex);
        if (raidPtr->numNewFailures > 0) {
                rf_unlock_mutex2(raidPtr->mutex);
                rf_update_component_labels(raidPtr,
                    RF_NORMAL_COMPONENT_UPDATE);
                rf_lock_mutex2(raidPtr->mutex);
                raidPtr->numNewFailures--;
        }
        rf_unlock_mutex2(raidPtr->mutex);

        if ((rs->sc_flags & RAIDF_INITED) == 0) {
                printf("raid%d: raidstart not ready\n", raidPtr->raidid);
                return;
        }

        dk_start(dksc, NULL);
}

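/*
 * Consume one opening and launch a non-blocking RAIDframe access for
 * the buffer, after bounds-checking the request against the array size.
 */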
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
        RF_SectorCount_t num_blocks, pb, sum;
        RF_RaidAddr_t raid_addr;
        daddr_t blocknum;
        int do_async;
        int rc;

        rf_lock_mutex2(raidPtr->mutex);
        if (raidPtr->openings == 0) {
                rf_unlock_mutex2(raidPtr->mutex);
                return EAGAIN;
        }
        rf_unlock_mutex2(raidPtr->mutex);

        blocknum = bp->b_rawblkno;

        db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
            (int) blocknum));

        db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
        db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

        /* *THIS* is where we adjust what block we're going to...
         * but DO NOT TOUCH bp->b_blkno!!! */
        raid_addr = blocknum;

        num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
        pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
        sum = raid_addr + num_blocks + pb;
        if (1 || rf_debugKernelAccess) {
                db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
                    (int) raid_addr, (int) sum, (int) num_blocks,
                    (int) pb, (int) bp->b_resid));
        }
        if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
            || (sum < num_blocks) || (sum < pb)) {
                rc = ENOSPC;
                goto done;
        }
        /*
         * XXX rf_DoAccess() should do this, not just DoAccessKernel()
         */

        if (bp->b_bcount & raidPtr->sectorMask) {
                rc = ENOSPC;
                goto done;
        }
        db1_printf(("Calling DoAccess..\n"));


        rf_lock_mutex2(raidPtr->mutex);
        raidPtr->openings--;
        rf_unlock_mutex2(raidPtr->mutex);

        /*
         * Everything is async.
         */
        do_async = 1;

        /* don't ever condition on bp->b_flags & B_WRITE.
         * always condition on B_READ instead */

        rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
            RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
            do_async, raid_addr, num_blocks,
            bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
        return rc;
}

/* invoke an I/O from kernel mode. Disk queue should be locked upon entry */

int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
        int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
        struct buf *bp;

        req->queue = queue;
        bp = req->bp;

        switch (req->type) {
        case RF_IO_TYPE_NOP:    /* used primarily to unlock a locked queue */
                /* XXX need to do something extra here.. */
                /* I'm leaving this in, as I've never actually seen it used,
                 * and I'd like folks to report it... GO */
1979 		printf("WAKEUP CALLED\n");
1980 queue->numOutstanding++;
1981
1982 bp->b_flags = 0;
1983 bp->b_private = req;
1984
1985 KernelWakeupFunc(bp);
1986 break;
1987
1988 case RF_IO_TYPE_READ:
1989 case RF_IO_TYPE_WRITE:
1990 #if RF_ACC_TRACE > 0
1991 if (req->tracerec) {
1992 RF_ETIMER_START(req->tracerec->timer);
1993 }
1994 #endif
1995 InitBP(bp, queue->rf_cinfo->ci_vp,
1996 op, queue->rf_cinfo->ci_dev,
1997 req->sectorOffset, req->numSector,
1998 req->buf, KernelWakeupFunc, (void *) req,
1999 queue->raidPtr->logBytesPerSector, req->b_proc);
2000
2001 if (rf_debugKernelAccess) {
2002 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2003 (long) bp->b_blkno));
2004 }
2005 queue->numOutstanding++;
2006 queue->last_deq_sector = req->sectorOffset;
2007 /* acc wouldn't have been let in if there were any pending
2008 * reqs at any other priority */
2009 queue->curPriority = req->priority;
2010
2011 db1_printf(("Going for %c to unit %d col %d\n",
2012 req->type, queue->raidPtr->raidid,
2013 queue->col));
2014 db1_printf(("sector %d count %d (%d bytes) %d\n",
2015 (int) req->sectorOffset, (int) req->numSector,
2016 (int) (req->numSector <<
2017 queue->raidPtr->logBytesPerSector),
2018 (int) queue->raidPtr->logBytesPerSector));
2019
2020 /*
2021 * XXX: drop lock here since this can block at
2022 * least with backing SCSI devices. Retake it
2023 * to minimize fuss with calling interfaces.
2024 */
2025
2026 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2027 bdev_strategy(bp);
2028 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2029 break;
2030
2031 default:
2032 panic("bad req->type in rf_DispatchKernelIO");
2033 }
2034 db1_printf(("Exiting from DispatchKernelIO\n"));
2035
2036 return (0);
2037 }
2038 /* this is the callback function associated with an I/O invoked from
2039 kernel code.
2040 */
2041 static void
2042 KernelWakeupFunc(struct buf *bp)
2043 {
2044 RF_DiskQueueData_t *req = NULL;
2045 RF_DiskQueue_t *queue;
2046
2047 db1_printf(("recovering the request queue:\n"));
2048
2049 req = bp->b_private;
2050
2051 queue = (RF_DiskQueue_t *) req->queue;
2052
2053 rf_lock_mutex2(queue->raidPtr->iodone_lock);
2054
2055 #if RF_ACC_TRACE > 0
2056 if (req->tracerec) {
2057 RF_ETIMER_STOP(req->tracerec->timer);
2058 RF_ETIMER_EVAL(req->tracerec->timer);
2059 rf_lock_mutex2(rf_tracing_mutex);
2060 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2061 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2062 req->tracerec->num_phys_ios++;
2063 rf_unlock_mutex2(rf_tracing_mutex);
2064 }
2065 #endif
2066
2067 /* XXX Ok, let's get aggressive... If b_error is set, let's go
2068 * ballistic, and mark the component as hosed... */
2069
2070 if (bp->b_error != 0) {
2071 /* Mark the disk as dead */
2072 /* but only mark it once... */
2073 /* and only if it wouldn't leave this RAID set
2074 completely broken */
2075 if (((queue->raidPtr->Disks[queue->col].status ==
2076 rf_ds_optimal) ||
2077 (queue->raidPtr->Disks[queue->col].status ==
2078 rf_ds_used_spare)) &&
2079 (queue->raidPtr->numFailures <
2080 queue->raidPtr->Layout.map->faultsTolerated)) {
2081 printf("raid%d: IO Error (%d). Marking %s as failed.\n",
2082 queue->raidPtr->raidid,
2083 bp->b_error,
2084 queue->raidPtr->Disks[queue->col].devname);
2085 queue->raidPtr->Disks[queue->col].status =
2086 rf_ds_failed;
2087 queue->raidPtr->status = rf_rs_degraded;
2088 queue->raidPtr->numFailures++;
2089 queue->raidPtr->numNewFailures++;
2090 } else { /* Disk is already dead... */
2091 /* printf("Disk already marked as dead!\n"); */
2092 }
2093
2094 }
2095
2096 /* Fill in the error value */
2097 req->error = bp->b_error;
2098
2099 /* Drop this one on the "finished" queue... */
2100 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2101
2102 /* Let the raidio thread know there is work to be done. */
2103 rf_signal_cond2(queue->raidPtr->iodone_cv);
2104
2105 rf_unlock_mutex2(queue->raidPtr->iodone_lock);
2106 }
2107
2108
2109 /*
2110 * initialize a buf structure for doing an I/O in the kernel.
2111 */
2112 static void
2113 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2114 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2115 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2116 struct proc *b_proc)
2117 {
2118 /* bp->b_flags = B_PHYS | rw_flag; */
2119 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2120 bp->b_oflags = 0;
2121 bp->b_cflags = 0;
2122 bp->b_bcount = numSect << logBytesPerSector;
2123 bp->b_bufsize = bp->b_bcount;
2124 bp->b_error = 0;
2125 bp->b_dev = dev;
2126 bp->b_data = bf;
2127 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2128 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2129 if (bp->b_bcount == 0) {
2130 panic("bp->b_bcount is zero in InitBP!!");
2131 }
2132 bp->b_proc = b_proc;
2133 bp->b_iodone = cbFunc;
2134 bp->b_private = cbArg;
2135 }
2136
2137 /*
2138 * Wait interruptibly for an exclusive lock.
2139 *
2140 * XXX
2141 * Several drivers do this; it should be abstracted and made MP-safe.
2142 * (Hmm... where have we seen this warning before :-> GO )
2143 */
2144 static int
2145 raidlock(struct raid_softc *rs)
2146 {
2147 int error;
2148
2149 error = 0;
2150 mutex_enter(&rs->sc_mutex);
2151 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2152 rs->sc_flags |= RAIDF_WANTED;
2153 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2154 if (error != 0)
2155 goto done;
2156 }
2157 rs->sc_flags |= RAIDF_LOCKED;
2158 done:
2159 mutex_exit(&rs->sc_mutex);
2160 return (error);
2161 }
2162 /*
2163 * Unlock and wake up any waiters.
2164 */
2165 static void
2166 raidunlock(struct raid_softc *rs)
2167 {
2168
2169 mutex_enter(&rs->sc_mutex);
2170 rs->sc_flags &= ~RAIDF_LOCKED;
2171 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2172 rs->sc_flags &= ~RAIDF_WANTED;
2173 cv_broadcast(&rs->sc_cv);
2174 }
2175 mutex_exit(&rs->sc_mutex);
2176 }
2177
2178
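/* On-disk metadata layout implied by the helpers below: the component
 * label lives RF_COMPONENT_INFO_OFFSET bytes into each component, in an
 * area of at least RF_COMPONENT_INFO_SIZE bytes (rounded up to one
 * sector), and the parity map follows immediately after.  Rounding the
 * offsets and sizes up to the sector size keeps the label I/O below
 * sector-aligned. */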
2179 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2180 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2181 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2182
2183 static daddr_t
2184 rf_component_info_offset(void)
2185 {
2186
2187 return RF_COMPONENT_INFO_OFFSET;
2188 }
2189
2190 static daddr_t
2191 rf_component_info_size(unsigned secsize)
2192 {
2193 daddr_t info_size;
2194
2195 KASSERT(secsize);
2196 if (secsize > RF_COMPONENT_INFO_SIZE)
2197 info_size = secsize;
2198 else
2199 info_size = RF_COMPONENT_INFO_SIZE;
2200
2201 return info_size;
2202 }
2203
2204 static daddr_t
2205 rf_parity_map_offset(RF_Raid_t *raidPtr)
2206 {
2207 daddr_t map_offset;
2208
2209 KASSERT(raidPtr->bytesPerSector);
2210 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2211 map_offset = raidPtr->bytesPerSector;
2212 else
2213 map_offset = RF_COMPONENT_INFO_SIZE;
2214 map_offset += rf_component_info_offset();
2215
2216 return map_offset;
2217 }
2218
2219 static daddr_t
2220 rf_parity_map_size(RF_Raid_t *raidPtr)
2221 {
2222 daddr_t map_size;
2223
2224 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2225 map_size = raidPtr->bytesPerSector;
2226 else
2227 map_size = RF_PARITY_MAP_SIZE;
2228
2229 return map_size;
2230 }
2231
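/* Mark the component label in column `col' clean and flush it to disk;
 * called when parity is known to be good. */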
2232 int
2233 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2234 {
2235 RF_ComponentLabel_t *clabel;
2236
2237 clabel = raidget_component_label(raidPtr, col);
2238 clabel->clean = RF_RAID_CLEAN;
2239 raidflush_component_label(raidPtr, col);
2240 return(0);
2241 }
2242
2243
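/* Mark the component label in column `col' dirty and flush it to disk,
 * so that an unclean shutdown can be detected the next time the set is
 * configured. */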
2244 int
2245 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2246 {
2247 RF_ComponentLabel_t *clabel;
2248
2249 clabel = raidget_component_label(raidPtr, col);
2250 clabel->clean = RF_RAID_DIRTY;
2251 raidflush_component_label(raidPtr, col);
2252 return(0);
2253 }
2254
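/* Component label accessors: raidfetch_component_label() reads the
 * on-disk label into the in-core copy, raidget_component_label()
 * returns a pointer to the in-core copy, and raidflush_component_label()
 * stamps the current mod_counter into the in-core copy and writes it
 * back out. */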
2255 int
2256 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2257 {
2258 KASSERT(raidPtr->bytesPerSector);
2259 return raidread_component_label(raidPtr->bytesPerSector,
2260 raidPtr->Disks[col].dev,
2261 raidPtr->raid_cinfo[col].ci_vp,
2262 &raidPtr->raid_cinfo[col].ci_label);
2263 }
2264
2265 RF_ComponentLabel_t *
2266 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2267 {
2268 return &raidPtr->raid_cinfo[col].ci_label;
2269 }
2270
2271 int
2272 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2273 {
2274 RF_ComponentLabel_t *label;
2275
2276 label = &raidPtr->raid_cinfo[col].ci_label;
2277 label->mod_counter = raidPtr->mod_counter;
2278 #ifndef RF_NO_PARITY_MAP
2279 label->parity_map_modcount = label->mod_counter;
2280 #endif
2281 return raidwrite_component_label(raidPtr->bytesPerSector,
2282 raidPtr->Disks[col].dev,
2283 raidPtr->raid_cinfo[col].ci_vp, label);
2284 }
2285
2286
2287 static int
2288 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2289 RF_ComponentLabel_t *clabel)
2290 {
2291 return raidread_component_area(dev, b_vp, clabel,
2292 sizeof(RF_ComponentLabel_t),
2293 rf_component_info_offset(),
2294 rf_component_info_size(secsize));
2295 }
2296
2297 /* ARGSUSED */
2298 static int
2299 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2300 size_t msize, daddr_t offset, daddr_t dsize)
2301 {
2302 struct buf *bp;
2303 int error;
2304
2305 /* XXX should probably ensure that we don't try to do this if
2306 someone has changed rf_protected_sectors. */
2307
2308 if (b_vp == NULL) {
2309 /* For whatever reason, this component is not valid.
2310 Don't try to read a component label from it. */
2311 return(EINVAL);
2312 }
2313
2314 /* get a block of the appropriate size... */
2315 bp = geteblk((int)dsize);
2316 bp->b_dev = dev;
2317
2318 /* get our ducks in a row for the read */
2319 bp->b_blkno = offset / DEV_BSIZE;
2320 bp->b_bcount = dsize;
2321 bp->b_flags |= B_READ;
2322 bp->b_resid = dsize;
2323
2324 bdev_strategy(bp);
2325 error = biowait(bp);
2326
2327 if (!error) {
2328 memcpy(data, bp->b_data, msize);
2329 }
2330
2331 brelse(bp, 0);
2332 return(error);
2333 }
2334
2335
2336 static int
2337 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2338 RF_ComponentLabel_t *clabel)
2339 {
2340 return raidwrite_component_area(dev, b_vp, clabel,
2341 sizeof(RF_ComponentLabel_t),
2342 rf_component_info_offset(),
2343 rf_component_info_size(secsize), 0);
2344 }
2345
2346 /* ARGSUSED */
2347 static int
2348 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2349 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2350 {
2351 struct buf *bp;
2352 int error;
2353
2354 /* get a block of the appropriate size... */
2355 bp = geteblk((int)dsize);
2356 bp->b_dev = dev;
2357
2358 /* get our ducks in a row for the write */
2359 bp->b_blkno = offset / DEV_BSIZE;
2360 bp->b_bcount = dsize;
2361 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2362 bp->b_resid = dsize;
2363
2364 memset(bp->b_data, 0, dsize);
2365 memcpy(bp->b_data, data, msize);
2366
2367 bdev_strategy(bp);
2368 if (asyncp)
2369 return 0;
2370 error = biowait(bp);
2371 brelse(bp, 0);
2372 if (error) {
2373 #if 1
2374 printf("Failed to write RAID component info!\n");
2375 #endif
2376 }
2377
2378 return(error);
2379 }
2380
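/* Write the in-core parity map to the reserved area of every
 * non-failed component. */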
2381 void
2382 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2383 {
2384 int c;
2385
2386 for (c = 0; c < raidPtr->numCol; c++) {
2387 /* Skip dead disks. */
2388 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2389 continue;
2390 /* XXXjld: what if an error occurs here? */
2391 raidwrite_component_area(raidPtr->Disks[c].dev,
2392 raidPtr->raid_cinfo[c].ci_vp, map,
2393 RF_PARITYMAP_NBYTE,
2394 rf_parity_map_offset(raidPtr),
2395 rf_parity_map_size(raidPtr), 0);
2396 }
2397 }
2398
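/* Read the parity map back from every non-failed component, merging
 * the per-component copies via rf_paritymap_merge() so the result
 * reflects all of them. */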
2399 void
2400 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2401 {
2402 struct rf_paritymap_ondisk tmp;
2403 int c,first;
2404
2405 first=1;
2406 for (c = 0; c < raidPtr->numCol; c++) {
2407 /* Skip dead disks. */
2408 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2409 continue;
2410 raidread_component_area(raidPtr->Disks[c].dev,
2411 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2412 RF_PARITYMAP_NBYTE,
2413 rf_parity_map_offset(raidPtr),
2414 rf_parity_map_size(raidPtr));
2415 if (first) {
2416 memcpy(map, &tmp, sizeof(*map));
2417 first = 0;
2418 } else {
2419 rf_paritymap_merge(map, &tmp);
2420 }
2421 }
2422 }
2423
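/* Bump the mod counter and mark every non-failed component dirty;
 * components whose label says rf_ds_spared are left untouched, and
 * in-use spares get a freshly initialized label before being marked
 * dirty. */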
2424 void
2425 rf_markalldirty(RF_Raid_t *raidPtr)
2426 {
2427 RF_ComponentLabel_t *clabel;
2428 int sparecol;
2429 int c;
2430 int j;
2431 int scol = -1;
2432
2433 raidPtr->mod_counter++;
2434 for (c = 0; c < raidPtr->numCol; c++) {
2435 /* we don't want to touch (at all) a disk that has
2436 failed */
2437 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2438 clabel = raidget_component_label(raidPtr, c);
2439 if (clabel->status == rf_ds_spared) {
2440 /* XXX do something special...
2441 but whatever you do, don't
2442 try to access it!! */
2443 } else {
2444 raidmarkdirty(raidPtr, c);
2445 }
2446 }
2447 }
2448
2449 for( c = 0; c < raidPtr->numSpare ; c++) {
2450 sparecol = raidPtr->numCol + c;
2451 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2452 			/*
2453 			 * We claim this disk is "optimal" if it's
2454 			 * rf_ds_used_spare, as that means it should be
2455 			 * directly substitutable for the disk it replaced.
2456 			 * We note that too...
2457 			 */
2460
2461 for(j=0;j<raidPtr->numCol;j++) {
2462 if (raidPtr->Disks[j].spareCol == sparecol) {
2463 scol = j;
2464 break;
2465 }
2466 }
2467
2468 clabel = raidget_component_label(raidPtr, sparecol);
2469 /* make sure status is noted */
2470
2471 raid_init_component_label(raidPtr, clabel);
2472
2473 clabel->row = 0;
2474 clabel->column = scol;
2475 /* Note: we *don't* change status from rf_ds_used_spare
2476 to rf_ds_optimal */
2477 /* clabel.status = rf_ds_optimal; */
2478
2479 raidmarkdirty(raidPtr, sparecol);
2480 }
2481 }
2482 }
2483
2484
2485 void
2486 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2487 {
2488 RF_ComponentLabel_t *clabel;
2489 int sparecol;
2490 int c;
2491 int j;
2492 int scol;
2493 struct raid_softc *rs = raidPtr->softc;
2494
2495 scol = -1;
2496
2497 /* XXX should do extra checks to make sure things really are clean,
2498 rather than blindly setting the clean bit... */
2499
2500 raidPtr->mod_counter++;
2501
2502 for (c = 0; c < raidPtr->numCol; c++) {
2503 if (raidPtr->Disks[c].status == rf_ds_optimal) {
2504 clabel = raidget_component_label(raidPtr, c);
2505 /* make sure status is noted */
2506 clabel->status = rf_ds_optimal;
2507
2508 /* note what unit we are configured as */
2509 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2510 clabel->last_unit = raidPtr->raidid;
2511
2512 raidflush_component_label(raidPtr, c);
2513 if (final == RF_FINAL_COMPONENT_UPDATE) {
2514 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2515 raidmarkclean(raidPtr, c);
2516 }
2517 }
2518 }
2519 /* else we don't touch it.. */
2520 }
2521
2522 for( c = 0; c < raidPtr->numSpare ; c++) {
2523 sparecol = raidPtr->numCol + c;
2524 /* Need to ensure that the reconstruct actually completed! */
2525 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2526 			/*
2527 			 * We claim this disk is "optimal" if it's
2528 			 * rf_ds_used_spare, as that means it should be
2529 			 * directly substitutable for the disk it replaced.
2530 			 * We note that too...
2531 			 */
2534
2535 for(j=0;j<raidPtr->numCol;j++) {
2536 if (raidPtr->Disks[j].spareCol == sparecol) {
2537 scol = j;
2538 break;
2539 }
2540 }
2541
2542 /* XXX shouldn't *really* need this... */
2543 clabel = raidget_component_label(raidPtr, sparecol);
2544 /* make sure status is noted */
2545
2546 raid_init_component_label(raidPtr, clabel);
2547
2548 clabel->column = scol;
2549 clabel->status = rf_ds_optimal;
2550 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2551 clabel->last_unit = raidPtr->raidid;
2552
2553 raidflush_component_label(raidPtr, sparecol);
2554 if (final == RF_FINAL_COMPONENT_UPDATE) {
2555 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2556 raidmarkclean(raidPtr, sparecol);
2557 }
2558 }
2559 }
2560 }
2561 }
2562
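/* Close a component's vnode using the close path that matches how it
 * was opened: VOP_CLOSE()/vput() for auto-configured components,
 * vn_close() otherwise. */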
2563 void
2564 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2565 {
2566
2567 if (vp != NULL) {
2568 if (auto_configured == 1) {
2569 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2570 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2571 vput(vp);
2572
2573 } else {
2574 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2575 }
2576 }
2577 }
2578
2579
2580 void
2581 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2582 {
2583 int r,c;
2584 struct vnode *vp;
2585 int acd;
2586
2587
2588 /* We take this opportunity to close the vnodes like we should.. */
2589
2590 for (c = 0; c < raidPtr->numCol; c++) {
2591 vp = raidPtr->raid_cinfo[c].ci_vp;
2592 acd = raidPtr->Disks[c].auto_configured;
2593 rf_close_component(raidPtr, vp, acd);
2594 raidPtr->raid_cinfo[c].ci_vp = NULL;
2595 raidPtr->Disks[c].auto_configured = 0;
2596 }
2597
2598 for (r = 0; r < raidPtr->numSpare; r++) {
2599 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2600 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2601 rf_close_component(raidPtr, vp, acd);
2602 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2603 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2604 }
2605 }
2606
2607
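/* The threads below carry out the long-running operations (failing a
 * disk and reconstructing, rewriting parity, copyback, in-place
 * reconstruction).  Each marks its operation as in progress, does the
 * work at splbio, and exits. */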
2608 void
2609 rf_ReconThread(struct rf_recon_req_internal *req)
2610 {
2611 int s;
2612 RF_Raid_t *raidPtr;
2613
2614 s = splbio();
2615 raidPtr = (RF_Raid_t *) req->raidPtr;
2616 raidPtr->recon_in_progress = 1;
2617
2618 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2619 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2620
2621 RF_Free(req, sizeof(*req));
2622
2623 raidPtr->recon_in_progress = 0;
2624 splx(s);
2625
2626 /* That's all... */
2627 kthread_exit(0); /* does not return */
2628 }
2629
2630 void
2631 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2632 {
2633 int retcode;
2634 int s;
2635
2636 raidPtr->parity_rewrite_stripes_done = 0;
2637 raidPtr->parity_rewrite_in_progress = 1;
2638 s = splbio();
2639 retcode = rf_RewriteParity(raidPtr);
2640 splx(s);
2641 if (retcode) {
2642 printf("raid%d: Error re-writing parity (%d)!\n",
2643 raidPtr->raidid, retcode);
2644 } else {
2645 /* set the clean bit! If we shutdown correctly,
2646 the clean bit on each component label will get
2647 set */
2648 raidPtr->parity_good = RF_RAID_CLEAN;
2649 }
2650 raidPtr->parity_rewrite_in_progress = 0;
2651
2652 /* Anyone waiting for us to stop? If so, inform them... */
2653 if (raidPtr->waitShutdown) {
2654 rf_lock_mutex2(raidPtr->rad_lock);
2655 cv_broadcast(&raidPtr->parity_rewrite_cv);
2656 rf_unlock_mutex2(raidPtr->rad_lock);
2657 }
2658
2659 /* That's all... */
2660 kthread_exit(0); /* does not return */
2661 }
2662
2663
2664 void
2665 rf_CopybackThread(RF_Raid_t *raidPtr)
2666 {
2667 int s;
2668
2669 raidPtr->copyback_in_progress = 1;
2670 s = splbio();
2671 rf_CopybackReconstructedData(raidPtr);
2672 splx(s);
2673 raidPtr->copyback_in_progress = 0;
2674
2675 /* That's all... */
2676 kthread_exit(0); /* does not return */
2677 }
2678
2679
2680 void
2681 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
2682 {
2683 int s;
2684 RF_Raid_t *raidPtr;
2685
2686 s = splbio();
2687 raidPtr = req->raidPtr;
2688 raidPtr->recon_in_progress = 1;
2689 rf_ReconstructInPlace(raidPtr, req->col);
2690 RF_Free(req, sizeof(*req));
2691 raidPtr->recon_in_progress = 0;
2692 splx(s);
2693
2694 /* That's all... */
2695 kthread_exit(0); /* does not return */
2696 }
2697
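/* Read the component label from (dev, vp).  If the label looks
 * reasonable and fits within `size', prepend a new RF_AutoConfig_t for
 * it to ac_list; otherwise close the vnode.  Returns the (possibly new)
 * head of the list, or NULL if we run out of memory. */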
2698 static RF_AutoConfig_t *
2699 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2700 const char *cname, RF_SectorCount_t size, uint64_t numsecs,
2701 unsigned secsize)
2702 {
2703 int good_one = 0;
2704 RF_ComponentLabel_t *clabel;
2705 RF_AutoConfig_t *ac;
2706
2707 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
2708 if (clabel == NULL) {
2709 oomem:
2710 while(ac_list) {
2711 ac = ac_list;
2712 if (ac->clabel)
2713 free(ac->clabel, M_RAIDFRAME);
2714 ac_list = ac_list->next;
2715 free(ac, M_RAIDFRAME);
2716 }
2717 printf("RAID auto config: out of memory!\n");
2718 return NULL; /* XXX probably should panic? */
2719 }
2720
2721 if (!raidread_component_label(secsize, dev, vp, clabel)) {
2722 /* Got the label. Does it look reasonable? */
2723 if (rf_reasonable_label(clabel, numsecs) &&
2724 (rf_component_label_partitionsize(clabel) <= size)) {
2725 #ifdef DEBUG
2726 printf("Component on: %s: %llu\n",
2727 cname, (unsigned long long)size);
2728 rf_print_component_label(clabel);
2729 #endif
2730 /* if it's reasonable, add it, else ignore it. */
2731 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2732 M_NOWAIT);
2733 if (ac == NULL) {
2734 free(clabel, M_RAIDFRAME);
2735 goto oomem;
2736 }
2737 strlcpy(ac->devname, cname, sizeof(ac->devname));
2738 ac->dev = dev;
2739 ac->vp = vp;
2740 ac->clabel = clabel;
2741 ac->next = ac_list;
2742 ac_list = ac;
2743 good_one = 1;
2744 }
2745 }
2746 if (!good_one) {
2747 /* cleanup */
2748 free(clabel, M_RAIDFRAME);
2749 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2750 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2751 vput(vp);
2752 }
2753 return ac_list;
2754 }
2755
2756 RF_AutoConfig_t *
2757 rf_find_raid_components(void)
2758 {
2759 struct vnode *vp;
2760 struct disklabel label;
2761 device_t dv;
2762 deviter_t di;
2763 dev_t dev;
2764 int bmajor, bminor, wedge, rf_part_found;
2765 int error;
2766 int i;
2767 RF_AutoConfig_t *ac_list;
2768 uint64_t numsecs;
2769 unsigned secsize;
2770 int dowedges;
2771
2772 /* initialize the AutoConfig list */
2773 ac_list = NULL;
2774
2775 	/*
2776 	 * We begin by trolling through *all* the devices on the system, *twice*:
2777 	 * first we scan for wedges, then for other devices.  This avoids
2778 	 * using a raw partition instead of a wedge that covers the whole disk.
2779 	 */
2780
2781 for (dowedges=1; dowedges>=0; --dowedges) {
2782 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
2783 dv = deviter_next(&di)) {
2784
2785 /* we are only interested in disks... */
2786 if (device_class(dv) != DV_DISK)
2787 continue;
2788
2789 /* we don't care about floppies... */
2790 if (device_is_a(dv, "fd")) {
2791 continue;
2792 }
2793
2794 /* we don't care about CD's... */
2795 if (device_is_a(dv, "cd")) {
2796 continue;
2797 }
2798
2799 /* we don't care about md's... */
2800 if (device_is_a(dv, "md")) {
2801 continue;
2802 }
2803
2804 /* hdfd is the Atari/Hades floppy driver */
2805 if (device_is_a(dv, "hdfd")) {
2806 continue;
2807 }
2808
2809 /* fdisa is the Atari/Milan floppy driver */
2810 if (device_is_a(dv, "fdisa")) {
2811 continue;
2812 }
2813
2814 /* are we in the wedges pass ? */
2815 wedge = device_is_a(dv, "dk");
2816 if (wedge != dowedges) {
2817 continue;
2818 }
2819
2820 /* need to find the device_name_to_block_device_major stuff */
2821 bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
2822
2823 			rf_part_found = 0; /* No raid partition as yet */
2824
2825 /* get a vnode for the raw partition of this disk */
2826 bminor = minor(device_unit(dv));
2827 dev = wedge ? makedev(bmajor, bminor) :
2828 MAKEDISKDEV(bmajor, bminor, RAW_PART);
2829 if (bdevvp(dev, &vp))
2830 panic("RAID can't alloc vnode");
2831
2832 error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
2833
2834 if (error) {
2835 /* "Who cares." Continue looking
2836 				   for something that exists */
2837 vput(vp);
2838 continue;
2839 }
2840
2841 error = getdisksize(vp, &numsecs, &secsize);
2842 if (error) {
2843 /*
2844 * Pseudo devices like vnd and cgd can be
2845 * opened but may still need some configuration.
2846 * Ignore these quietly.
2847 */
2848 if (error != ENXIO)
2849 printf("RAIDframe: can't get disk size"
2850 " for dev %s (%d)\n",
2851 device_xname(dv), error);
2852 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2853 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2854 vput(vp);
2855 continue;
2856 }
2857 if (wedge) {
2858 struct dkwedge_info dkw;
2859 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
2860 NOCRED);
2861 if (error) {
2862 printf("RAIDframe: can't get wedge info for "
2863 "dev %s (%d)\n", device_xname(dv), error);
2864 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2865 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2866 vput(vp);
2867 continue;
2868 }
2869
2870 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
2871 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2872 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2873 vput(vp);
2874 continue;
2875 }
2876
2877 ac_list = rf_get_component(ac_list, dev, vp,
2878 device_xname(dv), dkw.dkw_size, numsecs, secsize);
2879 				rf_part_found = 1; /* There is a raid component on this disk */
2880 continue;
2881 }
2882
2883 /* Ok, the disk exists. Go get the disklabel. */
2884 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
2885 if (error) {
2886 /*
2887 * XXX can't happen - open() would
2888 * have errored out (or faked up one)
2889 */
2890 if (error != ENOTTY)
2891 printf("RAIDframe: can't get label for dev "
2892 "%s (%d)\n", device_xname(dv), error);
2893 }
2894
2895 /* don't need this any more. We'll allocate it again
2896 a little later if we really do... */
2897 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2898 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2899 vput(vp);
2900
2901 if (error)
2902 continue;
2903
2904 			rf_part_found = 0; /* No raid partitions yet */
2905 for (i = 0; i < label.d_npartitions; i++) {
2906 char cname[sizeof(ac_list->devname)];
2907
2908 /* We only support partitions marked as RAID */
2909 if (label.d_partitions[i].p_fstype != FS_RAID)
2910 continue;
2911
2912 dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
2913 if (bdevvp(dev, &vp))
2914 panic("RAID can't alloc vnode");
2915
2916 error = VOP_OPEN(vp, FREAD, NOCRED);
2917 if (error) {
2918 /* Whatever... */
2919 vput(vp);
2920 continue;
2921 }
2922 snprintf(cname, sizeof(cname), "%s%c",
2923 device_xname(dv), 'a' + i);
2924 ac_list = rf_get_component(ac_list, dev, vp, cname,
2925 label.d_partitions[i].p_size, numsecs, secsize);
2926 				rf_part_found = 1; /* At least one raid partition on this disk */
2927 }
2928
2929 			/*
2930 			 * If there is no raid component on this disk, either in
2931 			 * a disklabel or inside a wedge, check the raw partition
2932 			 * as well, as it is possible to configure raid
2933 			 * components on raw disk devices.
2934 			 */
2935
2936 if (!rf_part_found) {
2937 char cname[sizeof(ac_list->devname)];
2938
2939 dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
2940 if (bdevvp(dev, &vp))
2941 panic("RAID can't alloc vnode");
2942
2943 error = VOP_OPEN(vp, FREAD, NOCRED);
2944 if (error) {
2945 /* Whatever... */
2946 vput(vp);
2947 continue;
2948 }
2949 snprintf(cname, sizeof(cname), "%s%c",
2950 device_xname(dv), 'a' + RAW_PART);
2951 ac_list = rf_get_component(ac_list, dev, vp, cname,
2952 label.d_partitions[RAW_PART].p_size, numsecs, secsize);
2953 }
2954 }
2955 deviter_release(&di);
2956 }
2957 return ac_list;
2958 }
2959
2960
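/* Sanity-check a component label: a known version, a sane clean flag,
 * and row/column/size fields that are internally consistent.  Returns 1
 * if the label is usable (after rf_fix_old_label_size() scrubs any old
 * garbage), 0 otherwise. */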
2961 int
2962 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
2963 {
2964
2965 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2966 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2967 ((clabel->clean == RF_RAID_CLEAN) ||
2968 (clabel->clean == RF_RAID_DIRTY)) &&
2969 	    clabel->row >= 0 &&
2970 clabel->column >= 0 &&
2971 clabel->num_rows > 0 &&
2972 clabel->num_columns > 0 &&
2973 clabel->row < clabel->num_rows &&
2974 clabel->column < clabel->num_columns &&
2975 clabel->blockSize > 0 &&
2976 /*
2977 * numBlocksHi may contain garbage, but it is ok since
2978 * the type is unsigned. If it is really garbage,
2979 * rf_fix_old_label_size() will fix it.
2980 */
2981 rf_component_label_numblocks(clabel) > 0) {
2982 /*
2983 * label looks reasonable enough...
2984 * let's make sure it has no old garbage.
2985 */
2986 if (numsecs)
2987 rf_fix_old_label_size(clabel, numsecs);
2988 return(1);
2989 }
2990 return(0);
2991 }
2992
2993
2994 /*
2995 * For reasons yet unknown, some old component labels have garbage in
2996 * the newer numBlocksHi region, and this causes lossage. Since those
2997 * disks will also have numsecs set to less than 32 bits of sectors,
2998 * we can determine when this corruption has occurred, and fix it.
2999 *
3000 * The exact same problem, with the same unknown reason, happens to
3001 * the partitionSizeHi member as well.
3002 */
3003 static void
3004 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3005 {
3006
3007 if (numsecs < ((uint64_t)1 << 32)) {
3008 if (clabel->numBlocksHi) {
3009 printf("WARNING: total sectors < 32 bits, yet "
3010 "numBlocksHi set\n"
3011 "WARNING: resetting numBlocksHi to zero.\n");
3012 clabel->numBlocksHi = 0;
3013 }
3014
3015 if (clabel->partitionSizeHi) {
3016 printf("WARNING: total sectors < 32 bits, yet "
3017 "partitionSizeHi set\n"
3018 "WARNING: resetting partitionSizeHi to zero.\n");
3019 clabel->partitionSizeHi = 0;
3020 }
3021 }
3022 }
3023
3024
3025 #ifdef DEBUG
3026 void
3027 rf_print_component_label(RF_ComponentLabel_t *clabel)
3028 {
3029 uint64_t numBlocks;
3030 static const char *rp[] = {
3031 "No", "Force", "Soft", "*invalid*"
3032 };
3033
3034
3035 numBlocks = rf_component_label_numblocks(clabel);
3036
3037 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
3038 clabel->row, clabel->column,
3039 clabel->num_rows, clabel->num_columns);
3040 printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
3041 clabel->version, clabel->serial_number,
3042 clabel->mod_counter);
3043 printf(" Clean: %s Status: %d\n",
3044 clabel->clean ? "Yes" : "No", clabel->status);
3045 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
3046 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
3047 printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n",
3048 (char) clabel->parityConfig, clabel->blockSize, numBlocks);
3049 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
3050 printf(" Root partition: %s\n", rp[clabel->root_partition & 3]);
3051 printf(" Last configured as: raid%d\n", clabel->last_unit);
3052 #if 0
3053 printf(" Config order: %d\n", clabel->config_order);
3054 #endif
3055
3056 }
3057 #endif
3058
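/* Partition the list of discovered components into config sets, one
 * per RAID set, using rf_does_it_fit() to match each component against
 * the first member of each existing set. */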
3059 RF_ConfigSet_t *
3060 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3061 {
3062 RF_AutoConfig_t *ac;
3063 RF_ConfigSet_t *config_sets;
3064 RF_ConfigSet_t *cset;
3065 RF_AutoConfig_t *ac_next;
3066
3067
3068 config_sets = NULL;
3069
3070 /* Go through the AutoConfig list, and figure out which components
3071 belong to what sets. */
3072 ac = ac_list;
3073 while(ac!=NULL) {
3074 /* we're going to putz with ac->next, so save it here
3075 for use at the end of the loop */
3076 ac_next = ac->next;
3077
3078 if (config_sets == NULL) {
3079 /* will need at least this one... */
3080 config_sets = (RF_ConfigSet_t *)
3081 malloc(sizeof(RF_ConfigSet_t),
3082 M_RAIDFRAME, M_NOWAIT);
3083 if (config_sets == NULL) {
3084 panic("rf_create_auto_sets: No memory!");
3085 }
3086 /* this one is easy :) */
3087 config_sets->ac = ac;
3088 config_sets->next = NULL;
3089 config_sets->rootable = 0;
3090 ac->next = NULL;
3091 } else {
3092 /* which set does this component fit into? */
3093 cset = config_sets;
3094 while(cset!=NULL) {
3095 if (rf_does_it_fit(cset, ac)) {
3096 /* looks like it matches... */
3097 ac->next = cset->ac;
3098 cset->ac = ac;
3099 break;
3100 }
3101 cset = cset->next;
3102 }
3103 if (cset==NULL) {
3104 /* didn't find a match above... new set..*/
3105 cset = (RF_ConfigSet_t *)
3106 malloc(sizeof(RF_ConfigSet_t),
3107 M_RAIDFRAME, M_NOWAIT);
3108 if (cset == NULL) {
3109 panic("rf_create_auto_sets: No memory!");
3110 }
3111 cset->ac = ac;
3112 ac->next = NULL;
3113 cset->next = config_sets;
3114 cset->rootable = 0;
3115 config_sets = cset;
3116 }
3117 }
3118 ac = ac_next;
3119 }
3120
3121
3122 return(config_sets);
3123 }
3124
3125 static int
3126 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3127 {
3128 RF_ComponentLabel_t *clabel1, *clabel2;
3129
3130 /* If this one matches the *first* one in the set, that's good
3131 enough, since the other members of the set would have been
3132 through here too... */
3133 /* note that we are not checking partitionSize here..
3134
3135 Note that we are also not checking the mod_counters here.
3136 If everything else matches except the mod_counter, that's
3137 good enough for this test. We will deal with the mod_counters
3138 a little later in the autoconfiguration process.
3139
3140 (clabel1->mod_counter == clabel2->mod_counter) &&
3141
3142 The reason we don't check for this is that failed disks
3143 will have lower modification counts. If those disks are
3144 not added to the set they used to belong to, then they will
3145 form their own set, which may result in 2 different sets,
3146 for example, competing to be configured at raid0, and
3147 perhaps competing to be the root filesystem set. If the
3148 wrong ones get configured, or both attempt to become /,
3149 	   weird behaviour and/or serious lossage will occur.  Thus we
3150 need to bring them into the fold here, and kick them out at
3151 a later point.
3152
3153 */
3154
3155 clabel1 = cset->ac->clabel;
3156 clabel2 = ac->clabel;
3157 if ((clabel1->version == clabel2->version) &&
3158 (clabel1->serial_number == clabel2->serial_number) &&
3159 (clabel1->num_rows == clabel2->num_rows) &&
3160 (clabel1->num_columns == clabel2->num_columns) &&
3161 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3162 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3163 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3164 (clabel1->parityConfig == clabel2->parityConfig) &&
3165 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3166 (clabel1->blockSize == clabel2->blockSize) &&
3167 rf_component_label_numblocks(clabel1) ==
3168 rf_component_label_numblocks(clabel2) &&
3169 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3170 (clabel1->root_partition == clabel2->root_partition) &&
3171 (clabel1->last_unit == clabel2->last_unit) &&
3172 (clabel1->config_order == clabel2->config_order)) {
3173 		/* if it gets here, it almost *has* to be a match */
3174 } else {
3175 /* it's not consistent with somebody in the set..
3176 punt */
3177 return(0);
3178 }
3179 /* all was fine.. it must fit... */
3180 return(1);
3181 }
3182
3183 int
3184 rf_have_enough_components(RF_ConfigSet_t *cset)
3185 {
3186 RF_AutoConfig_t *ac;
3187 RF_AutoConfig_t *auto_config;
3188 RF_ComponentLabel_t *clabel;
3189 int c;
3190 int num_cols;
3191 int num_missing;
3192 int mod_counter;
3193 int mod_counter_found;
3194 int even_pair_failed;
3195 char parity_type;
3196
3197
3198 /* check to see that we have enough 'live' components
3199 of this set. If so, we can configure it if necessary */
3200
3201 num_cols = cset->ac->clabel->num_columns;
3202 parity_type = cset->ac->clabel->parityConfig;
3203
3204 /* XXX Check for duplicate components!?!?!? */
3205
3206 /* Determine what the mod_counter is supposed to be for this set. */
3207
3208 mod_counter_found = 0;
3209 mod_counter = 0;
3210 ac = cset->ac;
3211 while(ac!=NULL) {
3212 if (mod_counter_found==0) {
3213 mod_counter = ac->clabel->mod_counter;
3214 mod_counter_found = 1;
3215 } else {
3216 if (ac->clabel->mod_counter > mod_counter) {
3217 mod_counter = ac->clabel->mod_counter;
3218 }
3219 }
3220 ac = ac->next;
3221 }
3222
3223 num_missing = 0;
3224 auto_config = cset->ac;
3225
3226 even_pair_failed = 0;
3227 for(c=0; c<num_cols; c++) {
3228 ac = auto_config;
3229 while(ac!=NULL) {
3230 if ((ac->clabel->column == c) &&
3231 (ac->clabel->mod_counter == mod_counter)) {
3232 /* it's this one... */
3233 #ifdef DEBUG
3234 printf("Found: %s at %d\n",
3235 ac->devname,c);
3236 #endif
3237 break;
3238 }
3239 ac=ac->next;
3240 }
3241 if (ac==NULL) {
3242 /* Didn't find one here! */
3243 /* special case for RAID 1, especially
3244 where there are more than 2
3245 components (where RAIDframe treats
3246 things a little differently :( ) */
3247 if (parity_type == '1') {
3248 if (c%2 == 0) { /* even component */
3249 even_pair_failed = 1;
3250 } else { /* odd component. If
3251 we're failed, and
3252 so is the even
3253 component, it's
3254 "Good Night, Charlie" */
3255 if (even_pair_failed == 1) {
3256 return(0);
3257 }
3258 }
3259 } else {
3260 /* normal accounting */
3261 num_missing++;
3262 }
3263 }
3264 if ((parity_type == '1') && (c%2 == 1)) {
3265 /* Just did an even component, and we didn't
3266 bail.. reset the even_pair_failed flag,
3267 and go on to the next component.... */
3268 even_pair_failed = 0;
3269 }
3270 }
3271
3272 clabel = cset->ac->clabel;
3273
3274 if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3275 ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3276 ((clabel->parityConfig == '5') && (num_missing > 1))) {
3277 /* XXX this needs to be made *much* more general */
3278 /* Too many failures */
3279 return(0);
3280 }
3281 /* otherwise, all is well, and we've got enough to take a kick
3282 at autoconfiguring this set */
3283 return(1);
3284 }
3285
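/* Build an RF_Config_t for an autoconfig set: the layout parameters
 * come from the component labels and the device name for each column
 * comes from the matching RF_AutoConfig_t entry. */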
3286 void
3287 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3288 RF_Raid_t *raidPtr)
3289 {
3290 RF_ComponentLabel_t *clabel;
3291 int i;
3292
3293 clabel = ac->clabel;
3294
3295 /* 1. Fill in the common stuff */
3296 config->numCol = clabel->num_columns;
3297 config->numSpare = 0; /* XXX should this be set here? */
3298 config->sectPerSU = clabel->sectPerSU;
3299 config->SUsPerPU = clabel->SUsPerPU;
3300 config->SUsPerRU = clabel->SUsPerRU;
3301 config->parityConfig = clabel->parityConfig;
3302 /* XXX... */
3303 strcpy(config->diskQueueType,"fifo");
3304 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3305 config->layoutSpecificSize = 0; /* XXX ?? */
3306
3307 while(ac!=NULL) {
3308 		/* row/col values will be in range due to the checks
3309 		   in rf_reasonable_label() */
3310 strcpy(config->devnames[0][ac->clabel->column],
3311 ac->devname);
3312 ac = ac->next;
3313 }
3314
3315 for(i=0;i<RF_MAXDBGV;i++) {
3316 config->debugVars[i][0] = 0;
3317 }
3318 }
3319
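/* Set the autoconfigure flag in-core and in the label of every optimal
 * component and every in-use spare, flushing each label to disk.
 * Returns the new value. */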
3320 int
3321 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3322 {
3323 RF_ComponentLabel_t *clabel;
3324 int column;
3325 int sparecol;
3326
3327 raidPtr->autoconfigure = new_value;
3328
3329 for(column=0; column<raidPtr->numCol; column++) {
3330 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3331 clabel = raidget_component_label(raidPtr, column);
3332 clabel->autoconfigure = new_value;
3333 raidflush_component_label(raidPtr, column);
3334 }
3335 }
3336 for(column = 0; column < raidPtr->numSpare ; column++) {
3337 sparecol = raidPtr->numCol + column;
3338 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3339 clabel = raidget_component_label(raidPtr, sparecol);
3340 clabel->autoconfigure = new_value;
3341 raidflush_component_label(raidPtr, sparecol);
3342 }
3343 }
3344 return(new_value);
3345 }
3346
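/* As above, but for the root_partition flag. */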
3347 int
3348 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3349 {
3350 RF_ComponentLabel_t *clabel;
3351 int column;
3352 int sparecol;
3353
3354 raidPtr->root_partition = new_value;
3355 for(column=0; column<raidPtr->numCol; column++) {
3356 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3357 clabel = raidget_component_label(raidPtr, column);
3358 clabel->root_partition = new_value;
3359 raidflush_component_label(raidPtr, column);
3360 }
3361 }
3362 for(column = 0; column < raidPtr->numSpare ; column++) {
3363 sparecol = raidPtr->numCol + column;
3364 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3365 clabel = raidget_component_label(raidPtr, sparecol);
3366 clabel->root_partition = new_value;
3367 raidflush_component_label(raidPtr, sparecol);
3368 }
3369 }
3370 return(new_value);
3371 }
3372
3373 void
3374 rf_release_all_vps(RF_ConfigSet_t *cset)
3375 {
3376 RF_AutoConfig_t *ac;
3377
3378 ac = cset->ac;
3379 while(ac!=NULL) {
3380 /* Close the vp, and give it back */
3381 if (ac->vp) {
3382 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3383 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3384 vput(ac->vp);
3385 ac->vp = NULL;
3386 }
3387 ac = ac->next;
3388 }
3389 }
3390
3391
3392 void
3393 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3394 {
3395 RF_AutoConfig_t *ac;
3396 RF_AutoConfig_t *next_ac;
3397
3398 ac = cset->ac;
3399 while(ac!=NULL) {
3400 next_ac = ac->next;
3401 /* nuke the label */
3402 free(ac->clabel, M_RAIDFRAME);
3403 /* cleanup the config structure */
3404 free(ac, M_RAIDFRAME);
3405 /* "next.." */
3406 ac = next_ac;
3407 }
3408 /* and, finally, nuke the config set */
3409 free(cset, M_RAIDFRAME);
3410 }
3411
3412
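/* Initialize a component label from the current state of the array.
 * The label starts out dirty; raidmarkclean() clears that once parity
 * is known to be good. */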
3413 void
3414 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3415 {
3416 /* current version number */
3417 clabel->version = RF_COMPONENT_LABEL_VERSION;
3418 clabel->serial_number = raidPtr->serial_number;
3419 clabel->mod_counter = raidPtr->mod_counter;
3420
3421 clabel->num_rows = 1;
3422 clabel->num_columns = raidPtr->numCol;
3423 clabel->clean = RF_RAID_DIRTY; /* not clean */
3424 clabel->status = rf_ds_optimal; /* "It's good!" */
3425
3426 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3427 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3428 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3429
3430 clabel->blockSize = raidPtr->bytesPerSector;
3431 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3432
3433 /* XXX not portable */
3434 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3435 clabel->maxOutstanding = raidPtr->maxOutstanding;
3436 clabel->autoconfigure = raidPtr->autoconfigure;
3437 clabel->root_partition = raidPtr->root_partition;
3438 clabel->last_unit = raidPtr->raidid;
3439 clabel->config_order = raidPtr->config_order;
3440
3441 #ifndef RF_NO_PARITY_MAP
3442 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3443 #endif
3444 }
3445
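/* Bring up one autoconfigured set: find a free unit at (or after) the
 * one recorded in the component labels, build a config from those
 * labels, and configure the set.  Returns the softc on success, NULL
 * on failure. */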
3446 struct raid_softc *
3447 rf_auto_config_set(RF_ConfigSet_t *cset)
3448 {
3449 RF_Raid_t *raidPtr;
3450 RF_Config_t *config;
3451 int raidID;
3452 struct raid_softc *sc;
3453
3454 #ifdef DEBUG
3455 printf("RAID autoconfigure\n");
3456 #endif
3457
3458 /* 1. Create a config structure */
3459 config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
3460 if (config == NULL) {
3461 printf("%s: Out of mem - config!?!?\n", __func__);
3462 /* XXX do something more intelligent here. */
3463 return NULL;
3464 }
3465
3466 /*
3467 2. Figure out what RAID ID this one is supposed to live at
3468 See if we can get the same RAID dev that it was configured
3469 on last time..
3470 */
3471
3472 raidID = cset->ac->clabel->last_unit;
3473 for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
3474 sc = raidget(++raidID, false))
3475 continue;
3476 #ifdef DEBUG
3477 printf("Configuring raid%d:\n",raidID);
3478 #endif
3479
3480 if (sc == NULL)
3481 sc = raidget(raidID, true);
3482 if (sc == NULL) {
3483 printf("%s: Out of mem - softc!?!?\n", __func__);
3484 /* XXX do something more intelligent here. */
3485 free(config, M_RAIDFRAME);
3486 return NULL;
3487 }
3488
3489 raidPtr = &sc->sc_r;
3490
3491 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3492 raidPtr->softc = sc;
3493 raidPtr->raidid = raidID;
3494 raidPtr->openings = RAIDOUTSTANDING;
3495
3496 /* 3. Build the configuration structure */
3497 rf_create_configuration(cset->ac, config, raidPtr);
3498
3499 /* 4. Do the configuration */
3500 if (rf_Configure(raidPtr, config, cset->ac) == 0) {
3501 raidinit(sc);
3502
3503 rf_markalldirty(raidPtr);
3504 raidPtr->autoconfigure = 1; /* XXX do this here? */
3505 switch (cset->ac->clabel->root_partition) {
3506 case 1: /* Force Root */
3507 case 2: /* Soft Root: root when boot partition part of raid */
3508 /*
3509 * everything configured just fine. Make a note
3510 * that this set is eligible to be root,
3511 * or forced to be root
3512 */
3513 cset->rootable = cset->ac->clabel->root_partition;
3514 /* XXX do this here? */
3515 raidPtr->root_partition = cset->rootable;
3516 break;
3517 default:
3518 break;
3519 }
3520 } else {
3521 raidput(sc);
3522 sc = NULL;
3523 }
3524
3525 /* 5. Cleanup */
3526 free(config, M_RAIDFRAME);
3527 return sc;
3528 }
3529
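/* Initialize a pool at IPL_BIO, primed with xmin items and with the
 * low-water mark at xmin and the high-water mark at xmax. */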
3530 void
3531 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
3532 size_t xmin, size_t xmax)
3533 {
3534 int error;
3535
3536 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3537 pool_sethiwat(p, xmax);
3538 if ((error = pool_prime(p, xmin)) != 0)
3539 panic("%s: failed to prime pool: %d", __func__, error);
3540 pool_setlowat(p, xmin);
3541 }
3542
3543 /*
3544  * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buffer queue
3545 * to see if there is IO pending and if that IO could possibly be done
3546 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3547 * otherwise.
3548 *
3549 */
3550 int
3551 rf_buf_queue_check(RF_Raid_t *raidPtr)
3552 {
3553 struct raid_softc *rs;
3554 struct dk_softc *dksc;
3555
3556 rs = raidPtr->softc;
3557 dksc = &rs->sc_dksc;
3558
3559 if ((rs->sc_flags & RAIDF_INITED) == 0)
3560 return 1;
3561
3562 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3563 /* there is work to do */
3564 return 0;
3565 }
3566 /* default is nothing to do */
3567 return 1;
3568 }
3569
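/* Fill in the size fields of a RAID component from the underlying
 * device, reserving the first rf_protectedSectors for the component
 * label area. */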
3570 int
3571 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3572 {
3573 uint64_t numsecs;
3574 unsigned secsize;
3575 int error;
3576
3577 error = getdisksize(vp, &numsecs, &secsize);
3578 if (error == 0) {
3579 diskPtr->blockSize = secsize;
3580 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3581 diskPtr->partitionSize = numsecs;
3582 return 0;
3583 }
3584 return error;
3585 }
3586
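/* Autoconfiguration glue: raid(4) units are created on demand, so the
 * match always succeeds and attach has nothing to do; the interesting
 * setup happens in raidinit(). */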
3587 static int
3588 raid_match(device_t self, cfdata_t cfdata, void *aux)
3589 {
3590 return 1;
3591 }
3592
3593 static void
3594 raid_attach(device_t parent, device_t self, void *aux)
3595 {
3596 }
3597
3598
3599 static int
3600 raid_detach(device_t self, int flags)
3601 {
3602 int error;
3603 struct raid_softc *rs = raidsoftc(self);
3604
3605 if (rs == NULL)
3606 return ENXIO;
3607
3608 if ((error = raidlock(rs)) != 0)
3609 return (error);
3610
3611 error = raid_detach_unlocked(rs);
3612
3613 raidunlock(rs);
3614
3615 /* XXX raid can be referenced here */
3616
3617 if (error)
3618 return error;
3619
3620 /* Free the softc */
3621 raidput(rs);
3622
3623 return 0;
3624 }
3625
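/* Synthesize a disk geometry for the set: only dg_secperunit and
 * dg_secsize are real; sectors-per-track is the data sectors per
 * stripe and the track count is fabricated. */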
3626 static void
3627 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3628 {
3629 struct dk_softc *dksc = &rs->sc_dksc;
3630 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3631
3632 memset(dg, 0, sizeof(*dg));
3633
3634 dg->dg_secperunit = raidPtr->totalSectors;
3635 dg->dg_secsize = raidPtr->bytesPerSector;
3636 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3637 dg->dg_ntracks = 4 * raidPtr->numCol;
3638
3639 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3640 }
3641
3642 /*
3643 * Get cache info for all the components (including spares).
3644 * Returns intersection of all the cache flags of all disks, or first
3645 * error if any encountered.
3646 * XXXfua feature flags can change as spares are added - lock down somehow
3647 */
3648 static int
3649 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3650 {
3651 int c;
3652 int error;
3653 int dkwhole = 0, dkpart;
3654
3655 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3656 /*
3657 * Check any non-dead disk, even when currently being
3658 * reconstructed.
3659 */
3660 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
3661 || raidPtr->Disks[c].status == rf_ds_reconstructing) {
3662 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3663 DIOCGCACHE, &dkpart, FREAD, NOCRED);
3664 if (error) {
3665 if (error != ENODEV) {
3666 printf("raid%d: get cache for component %s failed\n",
3667 raidPtr->raidid,
3668 raidPtr->Disks[c].devname);
3669 }
3670
3671 return error;
3672 }
3673
3674 if (c == 0)
3675 dkwhole = dkpart;
3676 else
3677 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3678 }
3679 }
3680
3681 *data = dkwhole;
3682
3683 return 0;
3684 }
3685
3686 /*
3687 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3688 * We end up returning whatever error was returned by the first cache flush
3689 * that fails.
3690 */
3691
3692 int
3693 rf_sync_component_caches(RF_Raid_t *raidPtr)
3694 {
3695 int c, sparecol;
3696 int e,error;
3697 int force = 1;
3698
3699 error = 0;
3700 for (c = 0; c < raidPtr->numCol; c++) {
3701 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3702 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3703 &force, FWRITE, NOCRED);
3704 if (e) {
3705 if (e != ENODEV)
3706 printf("raid%d: cache flush to component %s failed.\n",
3707 raidPtr->raidid, raidPtr->Disks[c].devname);
3708 if (error == 0) {
3709 error = e;
3710 }
3711 }
3712 }
3713 }
3714
3715 for( c = 0; c < raidPtr->numSpare ; c++) {
3716 sparecol = raidPtr->numCol + c;
3717 /* Need to ensure that the reconstruct actually completed! */
3718 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3719 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3720 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3721 if (e) {
3722 if (e != ENODEV)
3723 printf("raid%d: cache flush to component %s failed.\n",
3724 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3725 if (error == 0) {
3726 error = e;
3727 }
3728 }
3729 }
3730 }
3731 return error;
3732 }
3733
3734 /* Fill in info with the current status */
3735 void
3736 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3737 {
3738
3739 if (raidPtr->status != rf_rs_reconstructing) {
3740 info->total = 100;
3741 info->completed = 100;
3742 } else {
3743 info->total = raidPtr->reconControl->numRUsTotal;
3744 info->completed = raidPtr->reconControl->numRUsComplete;
3745 }
3746 info->remaining = info->total - info->completed;
3747 }
3748
3749 /* Fill in info with the current status */
3750 void
3751 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3752 {
3753
3754 if (raidPtr->parity_rewrite_in_progress == 1) {
3755 info->total = raidPtr->Layout.numStripe;
3756 info->completed = raidPtr->parity_rewrite_stripes_done;
3757 } else {
3758 info->completed = 100;
3759 info->total = 100;
3760 }
3761 info->remaining = info->total - info->completed;
3762 }
3763
3764 /* Fill in info with the current status */
3765 void
3766 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3767 {
3768
3769 if (raidPtr->copyback_in_progress == 1) {
3770 info->total = raidPtr->Layout.numStripe;
3771 info->completed = raidPtr->copyback_stripes_done;
3772 info->remaining = info->total - info->completed;
3773 } else {
3774 info->remaining = 0;
3775 info->completed = 100;
3776 info->total = 100;
3777 }
3778 }
3779
3780 /* Fill in config with the current info */
3781 int
3782 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3783 {
3784 int d, i, j;
3785
3786 if (!raidPtr->valid)
3787 return (ENODEV);
3788 config->cols = raidPtr->numCol;
3789 config->ndevs = raidPtr->numCol;
3790 if (config->ndevs >= RF_MAX_DISKS)
3791 return (ENOMEM);
3792 config->nspares = raidPtr->numSpare;
3793 if (config->nspares >= RF_MAX_DISKS)
3794 return (ENOMEM);
3795 config->maxqdepth = raidPtr->maxQueueDepth;
3796 d = 0;
3797 for (j = 0; j < config->cols; j++) {
3798 config->devs[d] = raidPtr->Disks[j];
3799 d++;
3800 }
3801 for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
3802 config->spares[i] = raidPtr->Disks[j];
3803 if (config->spares[i].status == rf_ds_rebuilding_spare) {
3804 /* XXX: raidctl(8) expects to see this as a used spare */
3805 config->spares[i].status = rf_ds_used_spare;
3806 }
3807 }
3808 return 0;
3809 }
3810
3811 int
3812 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
3813 {
3814 RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
3815 RF_ComponentLabel_t *raid_clabel;
3816 int column = clabel->column;
3817
3818 if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
3819 return EINVAL;
3820 raid_clabel = raidget_component_label(raidPtr, column);
3821 memcpy(clabel, raid_clabel, sizeof *clabel);
3822
3823 return 0;
3824 }
3825
3826 /*
3827 * Module interface
3828 */
3829
3830 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
3831
3832 #ifdef _MODULE
3833 CFDRIVER_DECL(raid, DV_DISK, NULL);
3834 #endif
3835
3836 static int raid_modcmd(modcmd_t, void *);
3837 static int raid_modcmd_init(void);
3838 static int raid_modcmd_fini(void);
3839
3840 static int
3841 raid_modcmd(modcmd_t cmd, void *data)
3842 {
3843 int error;
3844
3845 error = 0;
3846 switch (cmd) {
3847 case MODULE_CMD_INIT:
3848 error = raid_modcmd_init();
3849 break;
3850 case MODULE_CMD_FINI:
3851 error = raid_modcmd_fini();
3852 break;
3853 default:
3854 error = ENOTTY;
3855 break;
3856 }
3857 return error;
3858 }
3859
3860 static int
3861 raid_modcmd_init(void)
3862 {
3863 int error;
3864 int bmajor, cmajor;
3865
3866 mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
3867 mutex_enter(&raid_lock);
3868 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3869 rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
3870 rf_init_cond2(rf_sparet_wait_cv, "sparetw");
3871 rf_init_cond2(rf_sparet_resp_cv, "rfgst");
3872
3873 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
3874 #endif
3875
3876 bmajor = cmajor = -1;
3877 error = devsw_attach("raid", &raid_bdevsw, &bmajor,
3878 &raid_cdevsw, &cmajor);
3879 if (error != 0 && error != EEXIST) {
3880 aprint_error("%s: devsw_attach failed %d\n", __func__, error);
3881 mutex_exit(&raid_lock);
3882 return error;
3883 }
3884 #ifdef _MODULE
3885 error = config_cfdriver_attach(&raid_cd);
3886 if (error != 0) {
3887 aprint_error("%s: config_cfdriver_attach failed %d\n",
3888 __func__, error);
3889 devsw_detach(&raid_bdevsw, &raid_cdevsw);
3890 mutex_exit(&raid_lock);
3891 return error;
3892 }
3893 #endif
3894 error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3895 if (error != 0) {
3896 aprint_error("%s: config_cfattach_attach failed %d\n",
3897 __func__, error);
3898 #ifdef _MODULE
3899 config_cfdriver_detach(&raid_cd);
3900 #endif
3901 devsw_detach(&raid_bdevsw, &raid_cdevsw);
3902 mutex_exit(&raid_lock);
3903 return error;
3904 }
3905
3906 raidautoconfigdone = false;
3907
3908 mutex_exit(&raid_lock);
3909
3910 if (error == 0) {
3911 if (rf_BootRaidframe(true) == 0)
3912 aprint_verbose("Kernelized RAIDframe activated\n");
3913 else
3914 panic("Serious error activating RAID!!");
3915 }
3916
3917 /*
3918 * Register a finalizer which will be used to auto-config RAID
3919 * sets once all real hardware devices have been found.
3920 */
3921 error = config_finalize_register(NULL, rf_autoconfig);
3922 if (error != 0) {
3923 aprint_error("WARNING: unable to register RAIDframe "
3924 "finalizer\n");
3925 error = 0;
3926 }
3927
3928 return error;
3929 }
3930
3931 static int
3932 raid_modcmd_fini(void)
3933 {
3934 int error;
3935
3936 mutex_enter(&raid_lock);
3937
3938 /* Don't allow unload if raid device(s) exist. */
3939 if (!LIST_EMPTY(&raids)) {
3940 mutex_exit(&raid_lock);
3941 return EBUSY;
3942 }
3943
3944 error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
3945 if (error != 0) {
3946 aprint_error("%s: cannot detach cfattach\n",__func__);
3947 mutex_exit(&raid_lock);
3948 return error;
3949 }
3950 #ifdef _MODULE
3951 error = config_cfdriver_detach(&raid_cd);
3952 if (error != 0) {
3953 aprint_error("%s: cannot detach cfdriver\n",__func__);
3954 config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3955 mutex_exit(&raid_lock);
3956 return error;
3957 }
3958 #endif
3959 error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
3960 if (error != 0) {
3961 aprint_error("%s: cannot detach devsw\n",__func__);
3962 #ifdef _MODULE
3963 config_cfdriver_attach(&raid_cd);
3964 #endif
3965 config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3966 mutex_exit(&raid_lock);
3967 return error;
3968 }
3969 rf_BootRaidframe(false);
3970 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3971 rf_destroy_mutex2(rf_sparet_wait_mutex);
3972 rf_destroy_cond2(rf_sparet_wait_cv);
3973 rf_destroy_cond2(rf_sparet_resp_cv);
3974 #endif
3975 mutex_exit(&raid_lock);
3976 mutex_destroy(&raid_lock);
3977
3978 return error;
3979 }
3980