rf_netbsdkintf.c revision 1.346 1 /* $NetBSD: rf_netbsdkintf.c,v 1.346 2016/09/19 23:32:30 jdolecek Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.346 2016/09/19 23:32:30 jdolecek Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130
131 #include <prop/proplib.h>
132
133 #include <dev/raidframe/raidframevar.h>
134 #include <dev/raidframe/raidframeio.h>
135 #include <dev/raidframe/rf_paritymap.h>
136
137 #include "rf_raid.h"
138 #include "rf_copyback.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150
151 #ifdef COMPAT_50
152 #include "rf_compat50.h"
153 #endif
154
155 #include "ioconf.h"
156
/*
 * Debug output helpers.
 *
 * db1_printf((fmt, ...)) takes a fully parenthesized printf() argument
 * list.  In DEBUG kernels it prints when rf_kdebug_level > 0; otherwise
 * it does nothing.  Both variants are wrapped in do { } while (0) so the
 * macro always behaves as exactly one statement: a bare `if' would
 * capture a following `else', and a bare `{ }' followed by `;' breaks an
 * unbraced if/else around the call.
 *
 * DPRINTF(fmt, ...) prints only when DEBUG_ROOT is defined and needs at
 * least one argument after the format string.
 */
#ifdef DEBUG
int     rf_kdebug_level = 0;
#define db1_printf(a) do { if (rf_kdebug_level > 0) printf a; } while (0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (0)
#endif				/* DEBUG */

#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif
169
170 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
171 static rf_declare_mutex2(rf_sparet_wait_mutex);
172 static rf_declare_cond2(rf_sparet_wait_cv);
173 static rf_declare_cond2(rf_sparet_resp_cv);
174
175 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
176 * spare table */
177 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
178 * installation process */
179 #endif
180
181 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
182
183 /* prototypes */
184 static void KernelWakeupFunc(struct buf *);
185 static void InitBP(struct buf *, struct vnode *, unsigned,
186 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
187 void *, int, struct proc *);
188 struct raid_softc;
189 static void raidinit(struct raid_softc *);
190 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
191
192 static int raid_match(device_t, cfdata_t, void *);
193 static void raid_attach(device_t, device_t, void *);
194 static int raid_detach(device_t, int);
195
196 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
197 daddr_t, daddr_t);
198 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
199 daddr_t, daddr_t, int);
200
201 static int raidwrite_component_label(unsigned,
202 dev_t, struct vnode *, RF_ComponentLabel_t *);
203 static int raidread_component_label(unsigned,
204 dev_t, struct vnode *, RF_ComponentLabel_t *);
205
206 static int raid_diskstart(device_t, struct buf *bp);
207 static int raid_dumpblocks(device_t, void *, daddr_t, int);
208 static int raid_lastclose(device_t);
209
210 static dev_type_open(raidopen);
211 static dev_type_close(raidclose);
212 static dev_type_read(raidread);
213 static dev_type_write(raidwrite);
214 static dev_type_ioctl(raidioctl);
215 static dev_type_strategy(raidstrategy);
216 static dev_type_dump(raiddump);
217 static dev_type_size(raidsize);
218
219 const struct bdevsw raid_bdevsw = {
220 .d_open = raidopen,
221 .d_close = raidclose,
222 .d_strategy = raidstrategy,
223 .d_ioctl = raidioctl,
224 .d_dump = raiddump,
225 .d_psize = raidsize,
226 .d_discard = nodiscard,
227 .d_flag = D_DISK
228 };
229
230 const struct cdevsw raid_cdevsw = {
231 .d_open = raidopen,
232 .d_close = raidclose,
233 .d_read = raidread,
234 .d_write = raidwrite,
235 .d_ioctl = raidioctl,
236 .d_stop = nostop,
237 .d_tty = notty,
238 .d_poll = nopoll,
239 .d_mmap = nommap,
240 .d_kqfilter = nokqfilter,
241 .d_discard = nodiscard,
242 .d_flag = D_DISK
243 };
244
245 static struct dkdriver rf_dkdriver = {
246 .d_open = raidopen,
247 .d_close = raidclose,
248 .d_strategy = raidstrategy,
249 .d_diskstart = raid_diskstart,
250 .d_dumpblocks = raid_dumpblocks,
251 .d_lastclose = raid_lastclose,
252 .d_minphys = minphys
253 };
254
255 struct raid_softc {
256 struct dk_softc sc_dksc;
257 int sc_unit;
258 int sc_flags; /* flags */
259 int sc_cflags; /* configuration flags */
260 kmutex_t sc_mutex; /* interlock mutex */
261 kcondvar_t sc_cv; /* and the condvar */
262 uint64_t sc_size; /* size of the raid device */
263 char sc_xname[20]; /* XXX external name */
264 RF_Raid_t sc_r;
265 LIST_ENTRY(raid_softc) sc_link;
266 };
267 /* sc_flags */
268 #define RAIDF_INITED 0x01 /* unit has been initialized */
269 #define RAIDF_SHUTDOWN 0x02 /* unit is being shutdown */
270 #define RAIDF_DETACH 0x04 /* detach after final close */
271 #define RAIDF_WANTED 0x08 /* someone waiting to obtain a lock */
272 #define RAIDF_LOCKED 0x10 /* unit is locked */
273 #define RAIDF_UNIT_CHANGED 0x20 /* unit is being changed */
274
275 #define raidunit(x) DISKUNIT(x)
276 #define raidsoftc(dev) (((struct raid_softc *)device_private(dev))->sc_r.softc)
277
278 extern struct cfdriver raid_cd;
279 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
280 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
281 DVF_DETACH_SHUTDOWN);
282
/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is re-used immediately, that's still 128K, which when
 * multiplied by say 10 requests, is 1280K, *on top* of the 640K of
 * incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */
299
/* Default number of simultaneous I/Os; may be overridden at build time. */
#ifndef RAIDOUTSTANDING
#define	RAIDOUTSTANDING   6
#endif

/* dev_t of the RAW_PART partition on the same major/unit as `dev' */
#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
306
307 /* declared here, and made public, for the benefit of KVM stuff.. */
308
309 static int raidlock(struct raid_softc *);
310 static void raidunlock(struct raid_softc *);
311
312 static int raid_detach_unlocked(struct raid_softc *);
313
314 static void rf_markalldirty(RF_Raid_t *);
315 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
316
317 void rf_ReconThread(struct rf_recon_req *);
318 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
319 void rf_CopybackThread(RF_Raid_t *raidPtr);
320 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
321 int rf_autoconfig(device_t);
322 void rf_buildroothack(RF_ConfigSet_t *);
323
324 RF_AutoConfig_t *rf_find_raid_components(void);
325 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
326 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
327 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
328 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
329 int rf_set_autoconfig(RF_Raid_t *, int);
330 int rf_set_rootpartition(RF_Raid_t *, int);
331 void rf_release_all_vps(RF_ConfigSet_t *);
332 void rf_cleanup_config_set(RF_ConfigSet_t *);
333 int rf_have_enough_components(RF_ConfigSet_t *);
334 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
335 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
336
337 /*
338 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
339 * Note that this is overridden by having RAID_AUTOCONFIG as an option
340 * in the kernel config file.
341 */
342 #ifdef RAID_AUTOCONFIG
343 int raidautoconfig = 1;
344 #else
345 int raidautoconfig = 0;
346 #endif
347 static bool raidautoconfigdone = false;
348
349 struct RF_Pools_s rf_pools;
350
351 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
352 static kmutex_t raid_lock;
353
354 static struct raid_softc *
355 raidcreate(int unit) {
356 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
357 if (sc == NULL) {
358 #ifdef DIAGNOSTIC
359 printf("%s: out of memory\n", __func__);
360 #endif
361 return NULL;
362 }
363 sc->sc_unit = unit;
364 cv_init(&sc->sc_cv, "raidunit");
365 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
366 return sc;
367 }
368
369 static void
370 raiddestroy(struct raid_softc *sc) {
371 cv_destroy(&sc->sc_cv);
372 mutex_destroy(&sc->sc_mutex);
373 kmem_free(sc, sizeof(*sc));
374 }
375
376 static struct raid_softc *
377 raidget(int unit, bool create) {
378 struct raid_softc *sc;
379 if (unit < 0) {
380 #ifdef DIAGNOSTIC
381 panic("%s: unit %d!", __func__, unit);
382 #endif
383 return NULL;
384 }
385 mutex_enter(&raid_lock);
386 LIST_FOREACH(sc, &raids, sc_link) {
387 if (sc->sc_unit == unit) {
388 mutex_exit(&raid_lock);
389 return sc;
390 }
391 }
392 mutex_exit(&raid_lock);
393 if (!create)
394 return NULL;
395 if ((sc = raidcreate(unit)) == NULL)
396 return NULL;
397 mutex_enter(&raid_lock);
398 LIST_INSERT_HEAD(&raids, sc, sc_link);
399 mutex_exit(&raid_lock);
400 return sc;
401 }
402
403 static void
404 raidput(struct raid_softc *sc) {
405 mutex_enter(&raid_lock);
406 LIST_REMOVE(sc, sc_link);
407 mutex_exit(&raid_lock);
408 raiddestroy(sc);
409 }
410
/*
 * raidattach: pseudo-device attach hook.  Intentionally empty; `num' is
 * ignored.
 */
void
raidattach(int num)
{

	/*
	 * Nothing to do here: device attachment and the associated
	 * initialization are now performed as part of module
	 * initialization.
	 */
}
420
421 int
422 rf_autoconfig(device_t self)
423 {
424 RF_AutoConfig_t *ac_list;
425 RF_ConfigSet_t *config_sets;
426
427 if (!raidautoconfig || raidautoconfigdone == true)
428 return (0);
429
430 /* XXX This code can only be run once. */
431 raidautoconfigdone = true;
432
433 #ifdef __HAVE_CPU_BOOTCONF
434 /*
435 * 0. find the boot device if needed first so we can use it later
436 * this needs to be done before we autoconfigure any raid sets,
437 * because if we use wedges we are not going to be able to open
438 * the boot device later
439 */
440 if (booted_device == NULL)
441 cpu_bootconf();
442 #endif
443 /* 1. locate all RAID components on the system */
444 aprint_debug("Searching for RAID components...\n");
445 ac_list = rf_find_raid_components();
446
447 /* 2. Sort them into their respective sets. */
448 config_sets = rf_create_auto_sets(ac_list);
449
450 /*
451 * 3. Evaluate each set and configure the valid ones.
452 * This gets done in rf_buildroothack().
453 */
454 rf_buildroothack(config_sets);
455
456 return 1;
457 }
458
459 static int
460 rf_containsboot(RF_Raid_t *r, device_t bdv) {
461 const char *bootname = device_xname(bdv);
462 size_t len = strlen(bootname);
463
464 for (int col = 0; col < r->numCol; col++) {
465 const char *devname = r->Disks[col].devname;
466 devname += sizeof("/dev/") - 1;
467 if (strncmp(devname, "dk", 2) == 0) {
468 const char *parent =
469 dkwedge_get_parent_name(r->Disks[col].dev);
470 if (parent != NULL)
471 devname = parent;
472 }
473 if (strncmp(devname, bootname, len) == 0) {
474 struct raid_softc *sc = r->softc;
475 aprint_debug("raid%d includes boot device %s\n",
476 sc->sc_unit, devname);
477 return 1;
478 }
479 }
480 return 0;
481 }
482
483 void
484 rf_buildroothack(RF_ConfigSet_t *config_sets)
485 {
486 RF_ConfigSet_t *cset;
487 RF_ConfigSet_t *next_cset;
488 int num_root;
489 struct raid_softc *sc, *rsc;
490 struct dk_softc *dksc;
491
492 sc = rsc = NULL;
493 num_root = 0;
494 cset = config_sets;
495 while (cset != NULL) {
496 next_cset = cset->next;
497 if (rf_have_enough_components(cset) &&
498 cset->ac->clabel->autoconfigure == 1) {
499 sc = rf_auto_config_set(cset);
500 if (sc != NULL) {
501 aprint_debug("raid%d: configured ok\n",
502 sc->sc_unit);
503 if (cset->rootable) {
504 rsc = sc;
505 num_root++;
506 }
507 } else {
508 /* The autoconfig didn't work :( */
509 aprint_debug("Autoconfig failed\n");
510 rf_release_all_vps(cset);
511 }
512 } else {
513 /* we're not autoconfiguring this set...
514 release the associated resources */
515 rf_release_all_vps(cset);
516 }
517 /* cleanup */
518 rf_cleanup_config_set(cset);
519 cset = next_cset;
520 }
521 dksc = &rsc->sc_dksc;
522
523 /* if the user has specified what the root device should be
524 then we don't touch booted_device or boothowto... */
525
526 if (rootspec != NULL)
527 return;
528
529 /* we found something bootable... */
530
531 /*
532 * XXX: The following code assumes that the root raid
533 * is the first ('a') partition. This is about the best
534 * we can do with a BSD disklabel, but we might be able
535 * to do better with a GPT label, by setting a specified
536 * attribute to indicate the root partition. We can then
537 * stash the partition number in the r->root_partition
538 * high bits (the bottom 2 bits are already used). For
539 * now we just set booted_partition to 0 when we override
540 * root.
541 */
542 if (num_root == 1) {
543 device_t candidate_root;
544 if (dksc->sc_dkdev.dk_nwedges != 0) {
545 char cname[sizeof(cset->ac->devname)];
546 /* XXX: assume partition 'a' first */
547 snprintf(cname, sizeof(cname), "%s%c",
548 device_xname(dksc->sc_dev), 'a');
549 candidate_root = dkwedge_find_by_wname(cname);
550 DPRINTF("%s: candidate wedge root=%s\n", __func__,
551 cname);
552 if (candidate_root == NULL) {
553 /*
554 * If that is not found, because we don't use
555 * disklabel, return the first dk child
556 * XXX: we can skip the 'a' check above
557 * and always do this...
558 */
559 size_t i = 0;
560 candidate_root = dkwedge_find_by_parent(
561 device_xname(dksc->sc_dev), &i);
562 }
563 DPRINTF("%s: candidate wedge root=%p\n", __func__,
564 candidate_root);
565 } else
566 candidate_root = dksc->sc_dev;
567 DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
568 DPRINTF("%s: booted_device=%p root_partition=%d "
569 "contains_boot=%d\n", __func__, booted_device,
570 rsc->sc_r.root_partition,
571 rf_containsboot(&rsc->sc_r, booted_device));
572 if (booted_device == NULL ||
573 rsc->sc_r.root_partition == 1 ||
574 rf_containsboot(&rsc->sc_r, booted_device)) {
575 booted_device = candidate_root;
576 booted_partition = 0; /* XXX assume 'a' */
577 }
578 } else if (num_root > 1) {
579 DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
580 booted_device);
581
582 /*
583 * Maybe the MD code can help. If it cannot, then
584 * setroot() will discover that we have no
585 * booted_device and will ask the user if nothing was
586 * hardwired in the kernel config file
587 */
588 if (booted_device == NULL)
589 return;
590
591 num_root = 0;
592 mutex_enter(&raid_lock);
593 LIST_FOREACH(sc, &raids, sc_link) {
594 RF_Raid_t *r = &sc->sc_r;
595 if (r->valid == 0)
596 continue;
597
598 if (r->root_partition == 0)
599 continue;
600
601 if (rf_containsboot(r, booted_device)) {
602 num_root++;
603 rsc = sc;
604 dksc = &rsc->sc_dksc;
605 }
606 }
607 mutex_exit(&raid_lock);
608
609 if (num_root == 1) {
610 booted_device = dksc->sc_dev;
611 booted_partition = 0; /* XXX assume 'a' */
612 } else {
613 /* we can't guess.. require the user to answer... */
614 boothowto |= RB_ASKNAME;
615 }
616 }
617 }
618
619 static int
620 raidsize(dev_t dev)
621 {
622 struct raid_softc *rs;
623 struct dk_softc *dksc;
624 unsigned int unit;
625
626 unit = raidunit(dev);
627 if ((rs = raidget(unit, false)) == NULL)
628 return -1;
629 dksc = &rs->sc_dksc;
630
631 if ((rs->sc_flags & RAIDF_INITED) == 0)
632 return -1;
633
634 return dk_size(dksc, dev);
635 }
636
637 static int
638 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
639 {
640 unsigned int unit;
641 struct raid_softc *rs;
642 struct dk_softc *dksc;
643
644 unit = raidunit(dev);
645 if ((rs = raidget(unit, false)) == NULL)
646 return ENXIO;
647 dksc = &rs->sc_dksc;
648
649 if ((rs->sc_flags & RAIDF_INITED) == 0)
650 return ENODEV;
651
652 /*
653 Note that blkno is relative to this particular partition.
654 By adding adding RF_PROTECTED_SECTORS, we get a value that
655 is relative to the partition used for the underlying component.
656 */
657 blkno += RF_PROTECTED_SECTORS;
658
659 return dk_dump(dksc, dev, blkno, va, size);
660 }
661
662 static int
663 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
664 {
665 struct raid_softc *rs = raidsoftc(dev);
666 const struct bdevsw *bdev;
667 RF_Raid_t *raidPtr;
668 int c, sparecol, j, scol, dumpto;
669 int error = 0;
670
671 raidPtr = &rs->sc_r;
672
673 /* we only support dumping to RAID 1 sets */
674 if (raidPtr->Layout.numDataCol != 1 ||
675 raidPtr->Layout.numParityCol != 1)
676 return EINVAL;
677
678 if ((error = raidlock(rs)) != 0)
679 return error;
680
681 /* figure out what device is alive.. */
682
683 /*
684 Look for a component to dump to. The preference for the
685 component to dump to is as follows:
686 1) the master
687 2) a used_spare of the master
688 3) the slave
689 4) a used_spare of the slave
690 */
691
692 dumpto = -1;
693 for (c = 0; c < raidPtr->numCol; c++) {
694 if (raidPtr->Disks[c].status == rf_ds_optimal) {
695 /* this might be the one */
696 dumpto = c;
697 break;
698 }
699 }
700
701 /*
702 At this point we have possibly selected a live master or a
703 live slave. We now check to see if there is a spared
704 master (or a spared slave), if we didn't find a live master
705 or a live slave.
706 */
707
708 for (c = 0; c < raidPtr->numSpare; c++) {
709 sparecol = raidPtr->numCol + c;
710 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
711 /* How about this one? */
712 scol = -1;
713 for(j=0;j<raidPtr->numCol;j++) {
714 if (raidPtr->Disks[j].spareCol == sparecol) {
715 scol = j;
716 break;
717 }
718 }
719 if (scol == 0) {
720 /*
721 We must have found a spared master!
722 We'll take that over anything else
723 found so far. (We couldn't have
724 found a real master before, since
725 this is a used spare, and it's
726 saying that it's replacing the
727 master.) On reboot (with
728 autoconfiguration turned on)
729 sparecol will become the 1st
730 component (component0) of this set.
731 */
732 dumpto = sparecol;
733 break;
734 } else if (scol != -1) {
735 /*
736 Must be a spared slave. We'll dump
737 to that if we havn't found anything
738 else so far.
739 */
740 if (dumpto == -1)
741 dumpto = sparecol;
742 }
743 }
744 }
745
746 if (dumpto == -1) {
747 /* we couldn't find any live components to dump to!?!?
748 */
749 error = EINVAL;
750 goto out;
751 }
752
753 bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
754 if (bdev == NULL) {
755 error = ENXIO;
756 goto out;
757 }
758
759 error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
760 blkno, va, nblk * raidPtr->bytesPerSector);
761
762 out:
763 raidunlock(rs);
764
765 return error;
766 }
767
768 /* ARGSUSED */
769 static int
770 raidopen(dev_t dev, int flags, int fmt,
771 struct lwp *l)
772 {
773 int unit = raidunit(dev);
774 struct raid_softc *rs;
775 struct dk_softc *dksc;
776 int error = 0;
777 int part, pmask;
778
779 if ((rs = raidget(unit, true)) == NULL)
780 return ENXIO;
781 if ((error = raidlock(rs)) != 0)
782 return (error);
783
784 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
785 error = EBUSY;
786 goto bad;
787 }
788
789 dksc = &rs->sc_dksc;
790
791 part = DISKPART(dev);
792 pmask = (1 << part);
793
794 if (!DK_BUSY(dksc, pmask) &&
795 ((rs->sc_flags & RAIDF_INITED) != 0)) {
796 /* First one... mark things as dirty... Note that we *MUST*
797 have done a configure before this. I DO NOT WANT TO BE
798 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
799 THAT THEY BELONG TOGETHER!!!!! */
800 /* XXX should check to see if we're only open for reading
801 here... If so, we needn't do this, but then need some
802 other way of keeping track of what's happened.. */
803
804 rf_markalldirty(&rs->sc_r);
805 }
806
807 if ((rs->sc_flags & RAIDF_INITED) != 0)
808 error = dk_open(dksc, dev, flags, fmt, l);
809
810 bad:
811 raidunlock(rs);
812
813 return (error);
814
815
816 }
817
818 static int
819 raid_lastclose(device_t self)
820 {
821 struct raid_softc *rs = raidsoftc(self);
822
823 /* Last one... device is not unconfigured yet.
824 Device shutdown has taken care of setting the
825 clean bits if RAIDF_INITED is not set
826 mark things as clean... */
827
828 rf_update_component_labels(&rs->sc_r,
829 RF_FINAL_COMPONENT_UPDATE);
830
831 /* pass to unlocked code */
832 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
833 rs->sc_flags |= RAIDF_DETACH;
834
835 return 0;
836 }
837
838 /* ARGSUSED */
839 static int
840 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
841 {
842 int unit = raidunit(dev);
843 struct raid_softc *rs;
844 struct dk_softc *dksc;
845 cfdata_t cf;
846 int error = 0, do_detach = 0, do_put = 0;
847
848 if ((rs = raidget(unit, false)) == NULL)
849 return ENXIO;
850 dksc = &rs->sc_dksc;
851
852 if ((error = raidlock(rs)) != 0)
853 return (error);
854
855 if ((rs->sc_flags & RAIDF_INITED) != 0) {
856 error = dk_close(dksc, dev, flags, fmt, l);
857 if ((rs->sc_flags & RAIDF_DETACH) != 0)
858 do_detach = 1;
859 } else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
860 do_put = 1;
861
862 raidunlock(rs);
863
864 if (do_detach) {
865 /* free the pseudo device attach bits */
866 cf = device_cfdata(dksc->sc_dev);
867 error = config_detach(dksc->sc_dev, 0);
868 if (error == 0)
869 free(cf, M_RAIDFRAME);
870 } else if (do_put) {
871 raidput(rs);
872 }
873
874 return (error);
875
876 }
877
878 static void
879 raid_wakeup(RF_Raid_t *raidPtr)
880 {
881 rf_lock_mutex2(raidPtr->iodone_lock);
882 rf_signal_cond2(raidPtr->iodone_cv);
883 rf_unlock_mutex2(raidPtr->iodone_lock);
884 }
885
886 static void
887 raidstrategy(struct buf *bp)
888 {
889 unsigned int unit;
890 struct raid_softc *rs;
891 struct dk_softc *dksc;
892 RF_Raid_t *raidPtr;
893
894 unit = raidunit(bp->b_dev);
895 if ((rs = raidget(unit, false)) == NULL) {
896 bp->b_error = ENXIO;
897 goto fail;
898 }
899 if ((rs->sc_flags & RAIDF_INITED) == 0) {
900 bp->b_error = ENXIO;
901 goto fail;
902 }
903 dksc = &rs->sc_dksc;
904 raidPtr = &rs->sc_r;
905
906 /* Queue IO only */
907 if (dk_strategy_defer(dksc, bp))
908 goto done;
909
910 /* schedule the IO to happen at the next convenient time */
911 raid_wakeup(raidPtr);
912
913 done:
914 return;
915
916 fail:
917 bp->b_resid = bp->b_bcount;
918 biodone(bp);
919 }
920
921 static int
922 raid_diskstart(device_t dev, struct buf *bp)
923 {
924 struct raid_softc *rs = raidsoftc(dev);
925 RF_Raid_t *raidPtr;
926
927 raidPtr = &rs->sc_r;
928 if (!raidPtr->valid) {
929 db1_printf(("raid is not valid..\n"));
930 return ENODEV;
931 }
932
933 /* XXX */
934 bp->b_resid = 0;
935
936 return raiddoaccess(raidPtr, bp);
937 }
938
939 void
940 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
941 {
942 struct raid_softc *rs;
943 struct dk_softc *dksc;
944
945 rs = raidPtr->softc;
946 dksc = &rs->sc_dksc;
947
948 dk_done(dksc, bp);
949
950 rf_lock_mutex2(raidPtr->mutex);
951 raidPtr->openings++;
952 rf_unlock_mutex2(raidPtr->mutex);
953
954 /* schedule more IO */
955 raid_wakeup(raidPtr);
956 }
957
958 /* ARGSUSED */
959 static int
960 raidread(dev_t dev, struct uio *uio, int flags)
961 {
962 int unit = raidunit(dev);
963 struct raid_softc *rs;
964
965 if ((rs = raidget(unit, false)) == NULL)
966 return ENXIO;
967
968 if ((rs->sc_flags & RAIDF_INITED) == 0)
969 return (ENXIO);
970
971 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
972
973 }
974
975 /* ARGSUSED */
976 static int
977 raidwrite(dev_t dev, struct uio *uio, int flags)
978 {
979 int unit = raidunit(dev);
980 struct raid_softc *rs;
981
982 if ((rs = raidget(unit, false)) == NULL)
983 return ENXIO;
984
985 if ((rs->sc_flags & RAIDF_INITED) == 0)
986 return (ENXIO);
987
988 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
989
990 }
991
992 static int
993 raid_detach_unlocked(struct raid_softc *rs)
994 {
995 struct dk_softc *dksc = &rs->sc_dksc;
996 RF_Raid_t *raidPtr;
997 int error;
998
999 raidPtr = &rs->sc_r;
1000
1001 if (DK_BUSY(dksc, 0) ||
1002 raidPtr->recon_in_progress != 0 ||
1003 raidPtr->parity_rewrite_in_progress != 0 ||
1004 raidPtr->copyback_in_progress != 0)
1005 return EBUSY;
1006
1007 if ((rs->sc_flags & RAIDF_INITED) == 0)
1008 return 0;
1009
1010 rs->sc_flags &= ~RAIDF_SHUTDOWN;
1011
1012 if ((error = rf_Shutdown(raidPtr)) != 0)
1013 return error;
1014
1015 rs->sc_flags &= ~RAIDF_INITED;
1016
1017 /* Kill off any queued buffers */
1018 dk_drain(dksc);
1019 bufq_free(dksc->sc_bufq);
1020
1021 /* Detach the disk. */
1022 dkwedge_delall(&dksc->sc_dkdev);
1023 disk_detach(&dksc->sc_dkdev);
1024 disk_destroy(&dksc->sc_dkdev);
1025 dk_detach(dksc);
1026
1027 return 0;
1028 }
1029
1030 static int
1031 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1032 {
1033 int unit = raidunit(dev);
1034 int error = 0;
1035 int part, pmask;
1036 struct raid_softc *rs;
1037 struct dk_softc *dksc;
1038 RF_Config_t *k_cfg, *u_cfg;
1039 RF_Raid_t *raidPtr;
1040 RF_RaidDisk_t *diskPtr;
1041 RF_AccTotals_t *totals;
1042 RF_DeviceConfig_t *d_cfg, **ucfgp;
1043 u_char *specific_buf;
1044 int retcode = 0;
1045 int column;
1046 /* int raidid; */
1047 struct rf_recon_req *rrcopy, *rr;
1048 RF_ComponentLabel_t *clabel;
1049 RF_ComponentLabel_t *ci_label;
1050 RF_ComponentLabel_t **clabel_ptr;
1051 RF_SingleComponent_t *sparePtr,*componentPtr;
1052 RF_SingleComponent_t component;
1053 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1054 int i, j, d;
1055
1056 if ((rs = raidget(unit, false)) == NULL)
1057 return ENXIO;
1058 dksc = &rs->sc_dksc;
1059 raidPtr = &rs->sc_r;
1060
1061 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1062 (int) DISKPART(dev), (int) unit, cmd));
1063
1064 /* Must be initialized for these... */
1065 switch (cmd) {
1066 case RAIDFRAME_REWRITEPARITY:
1067 case RAIDFRAME_GET_INFO:
1068 case RAIDFRAME_RESET_ACCTOTALS:
1069 case RAIDFRAME_GET_ACCTOTALS:
1070 case RAIDFRAME_KEEP_ACCTOTALS:
1071 case RAIDFRAME_GET_SIZE:
1072 case RAIDFRAME_FAIL_DISK:
1073 case RAIDFRAME_COPYBACK:
1074 case RAIDFRAME_CHECK_RECON_STATUS:
1075 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1076 case RAIDFRAME_GET_COMPONENT_LABEL:
1077 case RAIDFRAME_SET_COMPONENT_LABEL:
1078 case RAIDFRAME_ADD_HOT_SPARE:
1079 case RAIDFRAME_REMOVE_HOT_SPARE:
1080 case RAIDFRAME_INIT_LABELS:
1081 case RAIDFRAME_REBUILD_IN_PLACE:
1082 case RAIDFRAME_CHECK_PARITY:
1083 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1084 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1085 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1086 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1087 case RAIDFRAME_SET_AUTOCONFIG:
1088 case RAIDFRAME_SET_ROOT:
1089 case RAIDFRAME_DELETE_COMPONENT:
1090 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1091 case RAIDFRAME_PARITYMAP_STATUS:
1092 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1093 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1094 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1095 if ((rs->sc_flags & RAIDF_INITED) == 0)
1096 return (ENXIO);
1097 }
1098
1099 switch (cmd) {
1100 #ifdef COMPAT_50
1101 case RAIDFRAME_GET_INFO50:
1102 return rf_get_info50(raidPtr, data);
1103
1104 case RAIDFRAME_CONFIGURE50:
1105 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1106 return retcode;
1107 goto config;
1108 #endif
1109 /* configure the system */
1110 case RAIDFRAME_CONFIGURE:
1111
1112 if (raidPtr->valid) {
1113 /* There is a valid RAID set running on this unit! */
1114 printf("raid%d: Device already configured!\n",unit);
1115 return(EINVAL);
1116 }
1117
1118 /* copy-in the configuration information */
1119 /* data points to a pointer to the configuration structure */
1120
1121 u_cfg = *((RF_Config_t **) data);
1122 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1123 if (k_cfg == NULL) {
1124 return (ENOMEM);
1125 }
1126 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1127 if (retcode) {
1128 RF_Free(k_cfg, sizeof(RF_Config_t));
1129 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1130 retcode));
1131 goto no_config;
1132 }
1133 goto config;
1134 config:
1135 rs->sc_flags &= ~RAIDF_SHUTDOWN;
1136
1137 /* allocate a buffer for the layout-specific data, and copy it
1138 * in */
1139 if (k_cfg->layoutSpecificSize) {
1140 if (k_cfg->layoutSpecificSize > 10000) {
1141 /* sanity check */
1142 RF_Free(k_cfg, sizeof(RF_Config_t));
1143 retcode = EINVAL;
1144 goto no_config;
1145 }
1146 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1147 (u_char *));
1148 if (specific_buf == NULL) {
1149 RF_Free(k_cfg, sizeof(RF_Config_t));
1150 retcode = ENOMEM;
1151 goto no_config;
1152 }
1153 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1154 k_cfg->layoutSpecificSize);
1155 if (retcode) {
1156 RF_Free(k_cfg, sizeof(RF_Config_t));
1157 RF_Free(specific_buf,
1158 k_cfg->layoutSpecificSize);
1159 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1160 retcode));
1161 goto no_config;
1162 }
1163 } else
1164 specific_buf = NULL;
1165 k_cfg->layoutSpecific = specific_buf;
1166
1167 /* should do some kind of sanity check on the configuration.
1168 * Store the sum of all the bytes in the last byte? */
1169
1170 /* configure the system */
1171
1172 /*
1173 * Clear the entire RAID descriptor, just to make sure
1174 * there is no stale data left in the case of a
1175 * reconfiguration
1176 */
1177 memset(raidPtr, 0, sizeof(*raidPtr));
1178 raidPtr->softc = rs;
1179 raidPtr->raidid = unit;
1180
1181 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1182
1183 if (retcode == 0) {
1184
1185 /* allow this many simultaneous IO's to
1186 this RAID device */
1187 raidPtr->openings = RAIDOUTSTANDING;
1188
1189 raidinit(rs);
1190 raid_wakeup(raidPtr);
1191 rf_markalldirty(raidPtr);
1192 }
1193 /* free the buffers. No return code here. */
1194 if (k_cfg->layoutSpecificSize) {
1195 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1196 }
1197 RF_Free(k_cfg, sizeof(RF_Config_t));
1198
1199 no_config:
1200 /*
1201 * If configuration failed, set sc_flags so that we
1202 * will detach the device when we close it.
1203 */
1204 if (retcode != 0)
1205 rs->sc_flags |= RAIDF_SHUTDOWN;
1206 return (retcode);
1207
1208 /* shutdown the system */
1209 case RAIDFRAME_SHUTDOWN:
1210
1211 part = DISKPART(dev);
1212 pmask = (1 << part);
1213
1214 if ((error = raidlock(rs)) != 0)
1215 return (error);
1216
1217 if (DK_BUSY(dksc, pmask) ||
1218 raidPtr->recon_in_progress != 0 ||
1219 raidPtr->parity_rewrite_in_progress != 0 ||
1220 raidPtr->copyback_in_progress != 0)
1221 retcode = EBUSY;
1222 else {
1223 /* detach and free on close */
1224 rs->sc_flags |= RAIDF_SHUTDOWN;
1225 retcode = 0;
1226 }
1227
1228 raidunlock(rs);
1229
1230 return (retcode);
1231 case RAIDFRAME_GET_COMPONENT_LABEL:
1232 clabel_ptr = (RF_ComponentLabel_t **) data;
1233 /* need to read the component label for the disk indicated
1234 by row,column in clabel */
1235
1236 /*
1237 * Perhaps there should be an option to skip the in-core
1238 * copy and hit the disk, as with disklabel(8).
1239 */
1240 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1241
1242 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1243
1244 if (retcode) {
1245 RF_Free(clabel, sizeof(*clabel));
1246 return retcode;
1247 }
1248
1249 clabel->row = 0; /* Don't allow looking at anything else.*/
1250
1251 column = clabel->column;
1252
1253 if ((column < 0) || (column >= raidPtr->numCol +
1254 raidPtr->numSpare)) {
1255 RF_Free(clabel, sizeof(*clabel));
1256 return EINVAL;
1257 }
1258
1259 RF_Free(clabel, sizeof(*clabel));
1260
1261 clabel = raidget_component_label(raidPtr, column);
1262
1263 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1264
1265 #if 0
1266 case RAIDFRAME_SET_COMPONENT_LABEL:
1267 clabel = (RF_ComponentLabel_t *) data;
1268
1269 /* XXX check the label for valid stuff... */
1270 /* Note that some things *should not* get modified --
1271 the user should be re-initing the labels instead of
1272 trying to patch things.
1273 */
1274
1275 raidid = raidPtr->raidid;
1276 #ifdef DEBUG
1277 printf("raid%d: Got component label:\n", raidid);
1278 printf("raid%d: Version: %d\n", raidid, clabel->version);
1279 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1280 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1281 printf("raid%d: Column: %d\n", raidid, clabel->column);
1282 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1283 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1284 printf("raid%d: Status: %d\n", raidid, clabel->status);
1285 #endif
1286 clabel->row = 0;
1287 column = clabel->column;
1288
1289 if ((column < 0) || (column >= raidPtr->numCol)) {
1290 return(EINVAL);
1291 }
1292
1293 /* XXX this isn't allowed to do anything for now :-) */
1294
1295 /* XXX and before it is, we need to fill in the rest
1296 of the fields!?!?!?! */
1297 memcpy(raidget_component_label(raidPtr, column),
1298 clabel, sizeof(*clabel));
1299 raidflush_component_label(raidPtr, column);
1300 return (0);
1301 #endif
1302
1303 case RAIDFRAME_INIT_LABELS:
1304 clabel = (RF_ComponentLabel_t *) data;
1305 /*
1306 we only want the serial number from
1307 the above. We get all the rest of the information
1308 from the config that was used to create this RAID
1309 set.
1310 */
1311
1312 raidPtr->serial_number = clabel->serial_number;
1313
1314 for(column=0;column<raidPtr->numCol;column++) {
1315 diskPtr = &raidPtr->Disks[column];
1316 if (!RF_DEAD_DISK(diskPtr->status)) {
1317 ci_label = raidget_component_label(raidPtr,
1318 column);
1319 /* Zeroing this is important. */
1320 memset(ci_label, 0, sizeof(*ci_label));
1321 raid_init_component_label(raidPtr, ci_label);
1322 ci_label->serial_number =
1323 raidPtr->serial_number;
1324 ci_label->row = 0; /* we dont' pretend to support more */
1325 rf_component_label_set_partitionsize(ci_label,
1326 diskPtr->partitionSize);
1327 ci_label->column = column;
1328 raidflush_component_label(raidPtr, column);
1329 }
1330 /* XXXjld what about the spares? */
1331 }
1332
1333 return (retcode);
1334 case RAIDFRAME_SET_AUTOCONFIG:
1335 d = rf_set_autoconfig(raidPtr, *(int *) data);
1336 printf("raid%d: New autoconfig value is: %d\n",
1337 raidPtr->raidid, d);
1338 *(int *) data = d;
1339 return (retcode);
1340
1341 case RAIDFRAME_SET_ROOT:
1342 d = rf_set_rootpartition(raidPtr, *(int *) data);
1343 printf("raid%d: New rootpartition value is: %d\n",
1344 raidPtr->raidid, d);
1345 *(int *) data = d;
1346 return (retcode);
1347
1348 /* initialize all parity */
1349 case RAIDFRAME_REWRITEPARITY:
1350
1351 if (raidPtr->Layout.map->faultsTolerated == 0) {
1352 /* Parity for RAID 0 is trivially correct */
1353 raidPtr->parity_good = RF_RAID_CLEAN;
1354 return(0);
1355 }
1356
1357 if (raidPtr->parity_rewrite_in_progress == 1) {
1358 /* Re-write is already in progress! */
1359 return(EINVAL);
1360 }
1361
1362 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1363 rf_RewriteParityThread,
1364 raidPtr,"raid_parity");
1365 return (retcode);
1366
1367
1368 case RAIDFRAME_ADD_HOT_SPARE:
1369 sparePtr = (RF_SingleComponent_t *) data;
1370 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1371 retcode = rf_add_hot_spare(raidPtr, &component);
1372 return(retcode);
1373
1374 case RAIDFRAME_REMOVE_HOT_SPARE:
1375 return(retcode);
1376
1377 case RAIDFRAME_DELETE_COMPONENT:
1378 componentPtr = (RF_SingleComponent_t *)data;
1379 memcpy( &component, componentPtr,
1380 sizeof(RF_SingleComponent_t));
1381 retcode = rf_delete_component(raidPtr, &component);
1382 return(retcode);
1383
1384 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1385 componentPtr = (RF_SingleComponent_t *)data;
1386 memcpy( &component, componentPtr,
1387 sizeof(RF_SingleComponent_t));
1388 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1389 return(retcode);
1390
1391 case RAIDFRAME_REBUILD_IN_PLACE:
1392
1393 if (raidPtr->Layout.map->faultsTolerated == 0) {
1394 /* Can't do this on a RAID 0!! */
1395 return(EINVAL);
1396 }
1397
1398 if (raidPtr->recon_in_progress == 1) {
1399 /* a reconstruct is already in progress! */
1400 return(EINVAL);
1401 }
1402
1403 componentPtr = (RF_SingleComponent_t *) data;
1404 memcpy( &component, componentPtr,
1405 sizeof(RF_SingleComponent_t));
1406 component.row = 0; /* we don't support any more */
1407 column = component.column;
1408
1409 if ((column < 0) || (column >= raidPtr->numCol)) {
1410 return(EINVAL);
1411 }
1412
1413 rf_lock_mutex2(raidPtr->mutex);
1414 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1415 (raidPtr->numFailures > 0)) {
1416 /* XXX 0 above shouldn't be constant!!! */
1417 /* some component other than this has failed.
1418 Let's not make things worse than they already
1419 are... */
1420 printf("raid%d: Unable to reconstruct to disk at:\n",
1421 raidPtr->raidid);
1422 printf("raid%d: Col: %d Too many failures.\n",
1423 raidPtr->raidid, column);
1424 rf_unlock_mutex2(raidPtr->mutex);
1425 return (EINVAL);
1426 }
1427 if (raidPtr->Disks[column].status ==
1428 rf_ds_reconstructing) {
1429 printf("raid%d: Unable to reconstruct to disk at:\n",
1430 raidPtr->raidid);
1431 printf("raid%d: Col: %d Reconstruction already occurring!\n", raidPtr->raidid, column);
1432
1433 rf_unlock_mutex2(raidPtr->mutex);
1434 return (EINVAL);
1435 }
1436 if (raidPtr->Disks[column].status == rf_ds_spared) {
1437 rf_unlock_mutex2(raidPtr->mutex);
1438 return (EINVAL);
1439 }
1440 rf_unlock_mutex2(raidPtr->mutex);
1441
1442 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1443 if (rrcopy == NULL)
1444 return(ENOMEM);
1445
1446 rrcopy->raidPtr = (void *) raidPtr;
1447 rrcopy->col = column;
1448
1449 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1450 rf_ReconstructInPlaceThread,
1451 rrcopy,"raid_reconip");
1452 return(retcode);
1453
1454 case RAIDFRAME_GET_INFO:
1455 if (!raidPtr->valid)
1456 return (ENODEV);
1457 ucfgp = (RF_DeviceConfig_t **) data;
1458 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1459 (RF_DeviceConfig_t *));
1460 if (d_cfg == NULL)
1461 return (ENOMEM);
1462 d_cfg->rows = 1; /* there is only 1 row now */
1463 d_cfg->cols = raidPtr->numCol;
1464 d_cfg->ndevs = raidPtr->numCol;
1465 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1466 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1467 return (ENOMEM);
1468 }
1469 d_cfg->nspares = raidPtr->numSpare;
1470 if (d_cfg->nspares >= RF_MAX_DISKS) {
1471 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1472 return (ENOMEM);
1473 }
1474 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1475 d = 0;
1476 for (j = 0; j < d_cfg->cols; j++) {
1477 d_cfg->devs[d] = raidPtr->Disks[j];
1478 d++;
1479 }
1480 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1481 d_cfg->spares[i] = raidPtr->Disks[j];
1482 if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
1483 /* XXX: raidctl(8) expects to see this as a used spare */
1484 d_cfg->spares[i].status = rf_ds_used_spare;
1485 }
1486 }
1487 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1488 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1489
1490 return (retcode);
1491
1492 case RAIDFRAME_CHECK_PARITY:
1493 *(int *) data = raidPtr->parity_good;
1494 return (0);
1495
1496 case RAIDFRAME_PARITYMAP_STATUS:
1497 if (rf_paritymap_ineligible(raidPtr))
1498 return EINVAL;
1499 rf_paritymap_status(raidPtr->parity_map,
1500 (struct rf_pmstat *)data);
1501 return 0;
1502
1503 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1504 if (rf_paritymap_ineligible(raidPtr))
1505 return EINVAL;
1506 if (raidPtr->parity_map == NULL)
1507 return ENOENT; /* ??? */
1508 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1509 (struct rf_pmparams *)data, 1))
1510 return EINVAL;
1511 return 0;
1512
1513 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1514 if (rf_paritymap_ineligible(raidPtr))
1515 return EINVAL;
1516 *(int *) data = rf_paritymap_get_disable(raidPtr);
1517 return 0;
1518
1519 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1520 if (rf_paritymap_ineligible(raidPtr))
1521 return EINVAL;
1522 rf_paritymap_set_disable(raidPtr, *(int *)data);
1523 /* XXX should errors be passed up? */
1524 return 0;
1525
1526 case RAIDFRAME_RESET_ACCTOTALS:
1527 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1528 return (0);
1529
1530 case RAIDFRAME_GET_ACCTOTALS:
1531 totals = (RF_AccTotals_t *) data;
1532 *totals = raidPtr->acc_totals;
1533 return (0);
1534
1535 case RAIDFRAME_KEEP_ACCTOTALS:
1536 raidPtr->keep_acc_totals = *(int *)data;
1537 return (0);
1538
1539 case RAIDFRAME_GET_SIZE:
1540 *(int *) data = raidPtr->totalSectors;
1541 return (0);
1542
1543 /* fail a disk & optionally start reconstruction */
1544 case RAIDFRAME_FAIL_DISK:
1545
1546 if (raidPtr->Layout.map->faultsTolerated == 0) {
1547 /* Can't do this on a RAID 0!! */
1548 return(EINVAL);
1549 }
1550
1551 rr = (struct rf_recon_req *) data;
1552 rr->row = 0;
1553 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1554 return (EINVAL);
1555
1556
1557 rf_lock_mutex2(raidPtr->mutex);
1558 if (raidPtr->status == rf_rs_reconstructing) {
1559 /* you can't fail a disk while we're reconstructing! */
1560 /* XXX wrong for RAID6 */
1561 rf_unlock_mutex2(raidPtr->mutex);
1562 return (EINVAL);
1563 }
1564 if ((raidPtr->Disks[rr->col].status ==
1565 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1566 /* some other component has failed. Let's not make
1567 things worse. XXX wrong for RAID6 */
1568 rf_unlock_mutex2(raidPtr->mutex);
1569 return (EINVAL);
1570 }
1571 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1572 /* Can't fail a spared disk! */
1573 rf_unlock_mutex2(raidPtr->mutex);
1574 return (EINVAL);
1575 }
1576 rf_unlock_mutex2(raidPtr->mutex);
1577
1578 /* make a copy of the recon request so that we don't rely on
1579 * the user's buffer */
1580 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1581 if (rrcopy == NULL)
1582 return(ENOMEM);
1583 memcpy(rrcopy, rr, sizeof(*rr));
1584 rrcopy->raidPtr = (void *) raidPtr;
1585
1586 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1587 rf_ReconThread,
1588 rrcopy,"raid_recon");
1589 return (0);
1590
1591 /* invoke a copyback operation after recon on whatever disk
1592 * needs it, if any */
1593 case RAIDFRAME_COPYBACK:
1594
1595 if (raidPtr->Layout.map->faultsTolerated == 0) {
1596 /* This makes no sense on a RAID 0!! */
1597 return(EINVAL);
1598 }
1599
1600 if (raidPtr->copyback_in_progress == 1) {
1601 /* Copyback is already in progress! */
1602 return(EINVAL);
1603 }
1604
1605 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1606 rf_CopybackThread,
1607 raidPtr,"raid_copyback");
1608 return (retcode);
1609
1610 /* return the percentage completion of reconstruction */
1611 case RAIDFRAME_CHECK_RECON_STATUS:
1612 if (raidPtr->Layout.map->faultsTolerated == 0) {
1613 /* This makes no sense on a RAID 0, so tell the
1614 user it's done. */
1615 *(int *) data = 100;
1616 return(0);
1617 }
1618 if (raidPtr->status != rf_rs_reconstructing)
1619 *(int *) data = 100;
1620 else {
1621 if (raidPtr->reconControl->numRUsTotal > 0) {
1622 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1623 } else {
1624 *(int *) data = 0;
1625 }
1626 }
1627 return (0);
1628 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1629 progressInfoPtr = (RF_ProgressInfo_t **) data;
1630 if (raidPtr->status != rf_rs_reconstructing) {
1631 progressInfo.remaining = 0;
1632 progressInfo.completed = 100;
1633 progressInfo.total = 100;
1634 } else {
1635 progressInfo.total =
1636 raidPtr->reconControl->numRUsTotal;
1637 progressInfo.completed =
1638 raidPtr->reconControl->numRUsComplete;
1639 progressInfo.remaining = progressInfo.total -
1640 progressInfo.completed;
1641 }
1642 retcode = copyout(&progressInfo, *progressInfoPtr,
1643 sizeof(RF_ProgressInfo_t));
1644 return (retcode);
1645
1646 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1647 if (raidPtr->Layout.map->faultsTolerated == 0) {
1648 /* This makes no sense on a RAID 0, so tell the
1649 user it's done. */
1650 *(int *) data = 100;
1651 return(0);
1652 }
1653 if (raidPtr->parity_rewrite_in_progress == 1) {
1654 *(int *) data = 100 *
1655 raidPtr->parity_rewrite_stripes_done /
1656 raidPtr->Layout.numStripe;
1657 } else {
1658 *(int *) data = 100;
1659 }
1660 return (0);
1661
1662 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1663 progressInfoPtr = (RF_ProgressInfo_t **) data;
1664 if (raidPtr->parity_rewrite_in_progress == 1) {
1665 progressInfo.total = raidPtr->Layout.numStripe;
1666 progressInfo.completed =
1667 raidPtr->parity_rewrite_stripes_done;
1668 progressInfo.remaining = progressInfo.total -
1669 progressInfo.completed;
1670 } else {
1671 progressInfo.remaining = 0;
1672 progressInfo.completed = 100;
1673 progressInfo.total = 100;
1674 }
1675 retcode = copyout(&progressInfo, *progressInfoPtr,
1676 sizeof(RF_ProgressInfo_t));
1677 return (retcode);
1678
1679 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1680 if (raidPtr->Layout.map->faultsTolerated == 0) {
1681 /* This makes no sense on a RAID 0 */
1682 *(int *) data = 100;
1683 return(0);
1684 }
1685 if (raidPtr->copyback_in_progress == 1) {
1686 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1687 raidPtr->Layout.numStripe;
1688 } else {
1689 *(int *) data = 100;
1690 }
1691 return (0);
1692
1693 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1694 progressInfoPtr = (RF_ProgressInfo_t **) data;
1695 if (raidPtr->copyback_in_progress == 1) {
1696 progressInfo.total = raidPtr->Layout.numStripe;
1697 progressInfo.completed =
1698 raidPtr->copyback_stripes_done;
1699 progressInfo.remaining = progressInfo.total -
1700 progressInfo.completed;
1701 } else {
1702 progressInfo.remaining = 0;
1703 progressInfo.completed = 100;
1704 progressInfo.total = 100;
1705 }
1706 retcode = copyout(&progressInfo, *progressInfoPtr,
1707 sizeof(RF_ProgressInfo_t));
1708 return (retcode);
1709
1710 case RAIDFRAME_SET_LAST_UNIT:
1711 for (column = 0; column < raidPtr->numCol; column++)
1712 if (raidPtr->Disks[column].status != rf_ds_optimal)
1713 return EBUSY;
1714
1715 for (column = 0; column < raidPtr->numCol; column++) {
1716 clabel = raidget_component_label(raidPtr, column);
1717 clabel->last_unit = *(int *)data;
1718 raidflush_component_label(raidPtr, column);
1719 }
1720 rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1721 return 0;
1722
1723 /* the sparetable daemon calls this to wait for the kernel to
1724 * need a spare table. this ioctl does not return until a
1725 * spare table is needed. XXX -- calling mpsleep here in the
1726 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1727 * -- I should either compute the spare table in the kernel,
1728 * or have a different -- XXX XXX -- interface (a different
1729 * character device) for delivering the table -- XXX */
1730 #if 0
1731 case RAIDFRAME_SPARET_WAIT:
1732 rf_lock_mutex2(rf_sparet_wait_mutex);
1733 while (!rf_sparet_wait_queue)
1734 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1735 waitreq = rf_sparet_wait_queue;
1736 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1737 rf_unlock_mutex2(rf_sparet_wait_mutex);
1738
1739 /* structure assignment */
1740 *((RF_SparetWait_t *) data) = *waitreq;
1741
1742 RF_Free(waitreq, sizeof(*waitreq));
1743 return (0);
1744
1745 /* wakes up a process waiting on SPARET_WAIT and puts an error
1746 * code in it that will cause the dameon to exit */
1747 case RAIDFRAME_ABORT_SPARET_WAIT:
1748 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1749 waitreq->fcol = -1;
1750 rf_lock_mutex2(rf_sparet_wait_mutex);
1751 waitreq->next = rf_sparet_wait_queue;
1752 rf_sparet_wait_queue = waitreq;
1753 rf_broadcast_conf2(rf_sparet_wait_cv);
1754 rf_unlock_mutex2(rf_sparet_wait_mutex);
1755 return (0);
1756
1757 /* used by the spare table daemon to deliver a spare table
1758 * into the kernel */
1759 case RAIDFRAME_SEND_SPARET:
1760
1761 /* install the spare table */
1762 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1763
1764 /* respond to the requestor. the return status of the spare
1765 * table installation is passed in the "fcol" field */
1766 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1767 waitreq->fcol = retcode;
1768 rf_lock_mutex2(rf_sparet_wait_mutex);
1769 waitreq->next = rf_sparet_resp_queue;
1770 rf_sparet_resp_queue = waitreq;
1771 rf_broadcast_cond2(rf_sparet_resp_cv);
1772 rf_unlock_mutex2(rf_sparet_wait_mutex);
1773
1774 return (retcode);
1775 #endif
1776
1777 default:
1778 break; /* fall through to the os-specific code below */
1779
1780 }
1781
1782 if (!raidPtr->valid)
1783 return (EINVAL);
1784
1785 /*
1786 * Add support for "regular" device ioctls here.
1787 */
1788
1789 switch (cmd) {
1790 case DIOCCACHESYNC:
1791 retcode = rf_sync_component_caches(raidPtr);
1792
1793 default:
1794 retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1795 }
1796
1797 return (retcode);
1798
1799 }
1800
1801
1802 /* raidinit -- complete the rest of the initialization for the
1803 RAIDframe device. */
1804
1805
1806 static void
1807 raidinit(struct raid_softc *rs)
1808 {
1809 cfdata_t cf;
1810 unsigned int unit;
1811 struct dk_softc *dksc = &rs->sc_dksc;
1812 RF_Raid_t *raidPtr = &rs->sc_r;
1813 device_t dev;
1814
1815 unit = raidPtr->raidid;
1816
1817 /* XXX doesn't check bounds. */
1818 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
1819
1820 /* attach the pseudo device */
1821 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1822 cf->cf_name = raid_cd.cd_name;
1823 cf->cf_atname = raid_cd.cd_name;
1824 cf->cf_unit = unit;
1825 cf->cf_fstate = FSTATE_STAR;
1826
1827 dev = config_attach_pseudo(cf);
1828 if (dev == NULL) {
1829 printf("raid%d: config_attach_pseudo failed\n",
1830 raidPtr->raidid);
1831 free(cf, M_RAIDFRAME);
1832 return;
1833 }
1834
1835 /* provide a backpointer to the real softc */
1836 raidsoftc(dev) = rs;
1837
1838 /* disk_attach actually creates space for the CPU disklabel, among
1839 * other things, so it's critical to call this *BEFORE* we try putzing
1840 * with disklabels. */
1841 dk_init(dksc, dev, DKTYPE_RAID);
1842 disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1843
1844 /* XXX There may be a weird interaction here between this, and
1845 * protectedSectors, as used in RAIDframe. */
1846
1847 rs->sc_size = raidPtr->totalSectors;
1848
1849 /* Attach dk and disk subsystems */
1850 dk_attach(dksc);
1851 disk_attach(&dksc->sc_dkdev);
1852 rf_set_geometry(rs, raidPtr);
1853
1854 bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
1855
1856 /* mark unit as usuable */
1857 rs->sc_flags |= RAIDF_INITED;
1858
1859 dkwedge_discover(&dksc->sc_dkdev);
1860 }
1861
1862 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1863 /* wake up the daemon & tell it to get us a spare table
1864 * XXX
1865 * the entries in the queues should be tagged with the raidPtr
1866 * so that in the extremely rare case that two recons happen at once,
1867 * we know for which device were requesting a spare table
1868 * XXX
1869 *
1870 * XXX This code is not currently used. GO
1871 */
1872 int
1873 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1874 {
1875 int retcode;
1876
1877 rf_lock_mutex2(rf_sparet_wait_mutex);
1878 req->next = rf_sparet_wait_queue;
1879 rf_sparet_wait_queue = req;
1880 rf_broadcast_cond2(rf_sparet_wait_cv);
1881
1882 /* mpsleep unlocks the mutex */
1883 while (!rf_sparet_resp_queue) {
1884 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
1885 }
1886 req = rf_sparet_resp_queue;
1887 rf_sparet_resp_queue = req->next;
1888 rf_unlock_mutex2(rf_sparet_wait_mutex);
1889
1890 retcode = req->fcol;
1891 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1892 * alloc'd */
1893 return (retcode);
1894 }
1895 #endif
1896
1897 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1898 * bp & passes it down.
1899 * any calls originating in the kernel must use non-blocking I/O
1900 * do some extra sanity checking to return "appropriate" error values for
1901 * certain conditions (to make some standard utilities work)
1902 *
1903 * Formerly known as: rf_DoAccessKernel
1904 */
1905 void
1906 raidstart(RF_Raid_t *raidPtr)
1907 {
1908 struct raid_softc *rs;
1909 struct dk_softc *dksc;
1910
1911 rs = raidPtr->softc;
1912 dksc = &rs->sc_dksc;
1913 /* quick check to see if anything has died recently */
1914 rf_lock_mutex2(raidPtr->mutex);
1915 if (raidPtr->numNewFailures > 0) {
1916 rf_unlock_mutex2(raidPtr->mutex);
1917 rf_update_component_labels(raidPtr,
1918 RF_NORMAL_COMPONENT_UPDATE);
1919 rf_lock_mutex2(raidPtr->mutex);
1920 raidPtr->numNewFailures--;
1921 }
1922 rf_unlock_mutex2(raidPtr->mutex);
1923
1924 if ((rs->sc_flags & RAIDF_INITED) == 0) {
1925 printf("raid%d: raidstart not ready\n", raidPtr->raidid);
1926 return;
1927 }
1928
1929 dk_start(dksc, NULL);
1930 }
1931
1932 static int
1933 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
1934 {
1935 RF_SectorCount_t num_blocks, pb, sum;
1936 RF_RaidAddr_t raid_addr;
1937 daddr_t blocknum;
1938 int do_async;
1939 int rc;
1940
1941 rf_lock_mutex2(raidPtr->mutex);
1942 if (raidPtr->openings == 0) {
1943 rf_unlock_mutex2(raidPtr->mutex);
1944 return EAGAIN;
1945 }
1946 rf_unlock_mutex2(raidPtr->mutex);
1947
1948 blocknum = bp->b_rawblkno;
1949
1950 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1951 (int) blocknum));
1952
1953 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1954 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1955
1956 /* *THIS* is where we adjust what block we're going to...
1957 * but DO NOT TOUCH bp->b_blkno!!! */
1958 raid_addr = blocknum;
1959
1960 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1961 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1962 sum = raid_addr + num_blocks + pb;
1963 if (1 || rf_debugKernelAccess) {
1964 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1965 (int) raid_addr, (int) sum, (int) num_blocks,
1966 (int) pb, (int) bp->b_resid));
1967 }
1968 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1969 || (sum < num_blocks) || (sum < pb)) {
1970 rc = ENOSPC;
1971 goto done;
1972 }
1973 /*
1974 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1975 */
1976
1977 if (bp->b_bcount & raidPtr->sectorMask) {
1978 rc = ENOSPC;
1979 goto done;
1980 }
1981 db1_printf(("Calling DoAccess..\n"));
1982
1983
1984 rf_lock_mutex2(raidPtr->mutex);
1985 raidPtr->openings--;
1986 rf_unlock_mutex2(raidPtr->mutex);
1987
1988 /*
1989 * Everything is async.
1990 */
1991 do_async = 1;
1992
1993 /* don't ever condition on bp->b_flags & B_WRITE.
1994 * always condition on B_READ instead */
1995
1996 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
1997 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
1998 do_async, raid_addr, num_blocks,
1999 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2000
2001 done:
2002 return rc;
2003 }
2004
2005 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2006
2007 int
2008 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2009 {
2010 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2011 struct buf *bp;
2012
2013 req->queue = queue;
2014 bp = req->bp;
2015
2016 switch (req->type) {
2017 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
2018 /* XXX need to do something extra here.. */
2019 /* I'm leaving this in, as I've never actually seen it used,
2020 * and I'd like folks to report it... GO */
2021 printf(("WAKEUP CALLED\n"));
2022 queue->numOutstanding++;
2023
2024 bp->b_flags = 0;
2025 bp->b_private = req;
2026
2027 KernelWakeupFunc(bp);
2028 break;
2029
2030 case RF_IO_TYPE_READ:
2031 case RF_IO_TYPE_WRITE:
2032 #if RF_ACC_TRACE > 0
2033 if (req->tracerec) {
2034 RF_ETIMER_START(req->tracerec->timer);
2035 }
2036 #endif
2037 InitBP(bp, queue->rf_cinfo->ci_vp,
2038 op, queue->rf_cinfo->ci_dev,
2039 req->sectorOffset, req->numSector,
2040 req->buf, KernelWakeupFunc, (void *) req,
2041 queue->raidPtr->logBytesPerSector, req->b_proc);
2042
2043 if (rf_debugKernelAccess) {
2044 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2045 (long) bp->b_blkno));
2046 }
2047 queue->numOutstanding++;
2048 queue->last_deq_sector = req->sectorOffset;
2049 /* acc wouldn't have been let in if there were any pending
2050 * reqs at any other priority */
2051 queue->curPriority = req->priority;
2052
2053 db1_printf(("Going for %c to unit %d col %d\n",
2054 req->type, queue->raidPtr->raidid,
2055 queue->col));
2056 db1_printf(("sector %d count %d (%d bytes) %d\n",
2057 (int) req->sectorOffset, (int) req->numSector,
2058 (int) (req->numSector <<
2059 queue->raidPtr->logBytesPerSector),
2060 (int) queue->raidPtr->logBytesPerSector));
2061
2062 /*
2063 * XXX: drop lock here since this can block at
2064 * least with backing SCSI devices. Retake it
2065 * to minimize fuss with calling interfaces.
2066 */
2067
2068 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2069 bdev_strategy(bp);
2070 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2071 break;
2072
2073 default:
2074 panic("bad req->type in rf_DispatchKernelIO");
2075 }
2076 db1_printf(("Exiting from DispatchKernelIO\n"));
2077
2078 return (0);
2079 }
2080 /* this is the callback function associated with a I/O invoked from
2081 kernel code.
2082 */
2083 static void
2084 KernelWakeupFunc(struct buf *bp)
2085 {
2086 RF_DiskQueueData_t *req = NULL;
2087 RF_DiskQueue_t *queue;
2088
2089 db1_printf(("recovering the request queue:\n"));
2090
2091 req = bp->b_private;
2092
2093 queue = (RF_DiskQueue_t *) req->queue;
2094
2095 rf_lock_mutex2(queue->raidPtr->iodone_lock);
2096
2097 #if RF_ACC_TRACE > 0
2098 if (req->tracerec) {
2099 RF_ETIMER_STOP(req->tracerec->timer);
2100 RF_ETIMER_EVAL(req->tracerec->timer);
2101 rf_lock_mutex2(rf_tracing_mutex);
2102 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2103 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2104 req->tracerec->num_phys_ios++;
2105 rf_unlock_mutex2(rf_tracing_mutex);
2106 }
2107 #endif
2108
2109 /* XXX Ok, let's get aggressive... If b_error is set, let's go
2110 * ballistic, and mark the component as hosed... */
2111
2112 if (bp->b_error != 0) {
2113 /* Mark the disk as dead */
2114 /* but only mark it once... */
2115 /* and only if it wouldn't leave this RAID set
2116 completely broken */
2117 if (((queue->raidPtr->Disks[queue->col].status ==
2118 rf_ds_optimal) ||
2119 (queue->raidPtr->Disks[queue->col].status ==
2120 rf_ds_used_spare)) &&
2121 (queue->raidPtr->numFailures <
2122 queue->raidPtr->Layout.map->faultsTolerated)) {
2123 printf("raid%d: IO Error (%d). Marking %s as failed.\n",
2124 queue->raidPtr->raidid,
2125 bp->b_error,
2126 queue->raidPtr->Disks[queue->col].devname);
2127 queue->raidPtr->Disks[queue->col].status =
2128 rf_ds_failed;
2129 queue->raidPtr->status = rf_rs_degraded;
2130 queue->raidPtr->numFailures++;
2131 queue->raidPtr->numNewFailures++;
2132 } else { /* Disk is already dead... */
2133 /* printf("Disk already marked as dead!\n"); */
2134 }
2135
2136 }
2137
2138 /* Fill in the error value */
2139 req->error = bp->b_error;
2140
2141 /* Drop this one on the "finished" queue... */
2142 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2143
2144 /* Let the raidio thread know there is work to be done. */
2145 rf_signal_cond2(queue->raidPtr->iodone_cv);
2146
2147 rf_unlock_mutex2(queue->raidPtr->iodone_lock);
2148 }
2149
2150
2151 /*
2152 * initialize a buf structure for doing an I/O in the kernel.
2153 */
2154 static void
2155 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2156 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2157 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2158 struct proc *b_proc)
2159 {
2160 /* bp->b_flags = B_PHYS | rw_flag; */
2161 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2162 bp->b_oflags = 0;
2163 bp->b_cflags = 0;
2164 bp->b_bcount = numSect << logBytesPerSector;
2165 bp->b_bufsize = bp->b_bcount;
2166 bp->b_error = 0;
2167 bp->b_dev = dev;
2168 bp->b_data = bf;
2169 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2170 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2171 if (bp->b_bcount == 0) {
2172 panic("bp->b_bcount is zero in InitBP!!");
2173 }
2174 bp->b_proc = b_proc;
2175 bp->b_iodone = cbFunc;
2176 bp->b_private = cbArg;
2177 }
2178
2179 /*
2180 * Wait interruptibly for an exclusive lock.
2181 *
2182 * XXX
2183 * Several drivers do this; it should be abstracted and made MP-safe.
2184 * (Hmm... where have we seen this warning before :-> GO )
2185 */
2186 static int
2187 raidlock(struct raid_softc *rs)
2188 {
2189 int error;
2190
2191 error = 0;
2192 mutex_enter(&rs->sc_mutex);
2193 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2194 rs->sc_flags |= RAIDF_WANTED;
2195 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2196 if (error != 0)
2197 goto done;
2198 }
2199 rs->sc_flags |= RAIDF_LOCKED;
2200 done:
2201 mutex_exit(&rs->sc_mutex);
2202 return (error);
2203 }
2204 /*
2205 * Unlock and wake up any waiters.
2206 */
2207 static void
2208 raidunlock(struct raid_softc *rs)
2209 {
2210
2211 mutex_enter(&rs->sc_mutex);
2212 rs->sc_flags &= ~RAIDF_LOCKED;
2213 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2214 rs->sc_flags &= ~RAIDF_WANTED;
2215 cv_broadcast(&rs->sc_cv);
2216 }
2217 mutex_exit(&rs->sc_mutex);
2218 }
2219
2220
2221 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2222 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2223 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2224
2225 static daddr_t
2226 rf_component_info_offset(void)
2227 {
2228
2229 return RF_COMPONENT_INFO_OFFSET;
2230 }
2231
2232 static daddr_t
2233 rf_component_info_size(unsigned secsize)
2234 {
2235 daddr_t info_size;
2236
2237 KASSERT(secsize);
2238 if (secsize > RF_COMPONENT_INFO_SIZE)
2239 info_size = secsize;
2240 else
2241 info_size = RF_COMPONENT_INFO_SIZE;
2242
2243 return info_size;
2244 }
2245
2246 static daddr_t
2247 rf_parity_map_offset(RF_Raid_t *raidPtr)
2248 {
2249 daddr_t map_offset;
2250
2251 KASSERT(raidPtr->bytesPerSector);
2252 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2253 map_offset = raidPtr->bytesPerSector;
2254 else
2255 map_offset = RF_COMPONENT_INFO_SIZE;
2256 map_offset += rf_component_info_offset();
2257
2258 return map_offset;
2259 }
2260
2261 static daddr_t
2262 rf_parity_map_size(RF_Raid_t *raidPtr)
2263 {
2264 daddr_t map_size;
2265
2266 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2267 map_size = raidPtr->bytesPerSector;
2268 else
2269 map_size = RF_PARITY_MAP_SIZE;
2270
2271 return map_size;
2272 }
2273
2274 int
2275 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2276 {
2277 RF_ComponentLabel_t *clabel;
2278
2279 clabel = raidget_component_label(raidPtr, col);
2280 clabel->clean = RF_RAID_CLEAN;
2281 raidflush_component_label(raidPtr, col);
2282 return(0);
2283 }
2284
2285
2286 int
2287 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2288 {
2289 RF_ComponentLabel_t *clabel;
2290
2291 clabel = raidget_component_label(raidPtr, col);
2292 clabel->clean = RF_RAID_DIRTY;
2293 raidflush_component_label(raidPtr, col);
2294 return(0);
2295 }
2296
2297 int
2298 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2299 {
2300 KASSERT(raidPtr->bytesPerSector);
2301 return raidread_component_label(raidPtr->bytesPerSector,
2302 raidPtr->Disks[col].dev,
2303 raidPtr->raid_cinfo[col].ci_vp,
2304 &raidPtr->raid_cinfo[col].ci_label);
2305 }
2306
2307 RF_ComponentLabel_t *
2308 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2309 {
2310 return &raidPtr->raid_cinfo[col].ci_label;
2311 }
2312
2313 int
2314 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2315 {
2316 RF_ComponentLabel_t *label;
2317
2318 label = &raidPtr->raid_cinfo[col].ci_label;
2319 label->mod_counter = raidPtr->mod_counter;
2320 #ifndef RF_NO_PARITY_MAP
2321 label->parity_map_modcount = label->mod_counter;
2322 #endif
2323 return raidwrite_component_label(raidPtr->bytesPerSector,
2324 raidPtr->Disks[col].dev,
2325 raidPtr->raid_cinfo[col].ci_vp, label);
2326 }
2327
2328
2329 static int
2330 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2331 RF_ComponentLabel_t *clabel)
2332 {
2333 return raidread_component_area(dev, b_vp, clabel,
2334 sizeof(RF_ComponentLabel_t),
2335 rf_component_info_offset(),
2336 rf_component_info_size(secsize));
2337 }
2338
2339 /* ARGSUSED */
2340 static int
2341 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2342 size_t msize, daddr_t offset, daddr_t dsize)
2343 {
2344 struct buf *bp;
2345 int error;
2346
2347 /* XXX should probably ensure that we don't try to do this if
2348 someone has changed rf_protected_sectors. */
2349
2350 if (b_vp == NULL) {
2351 /* For whatever reason, this component is not valid.
2352 Don't try to read a component label from it. */
2353 return(EINVAL);
2354 }
2355
2356 /* get a block of the appropriate size... */
2357 bp = geteblk((int)dsize);
2358 bp->b_dev = dev;
2359
2360 /* get our ducks in a row for the read */
2361 bp->b_blkno = offset / DEV_BSIZE;
2362 bp->b_bcount = dsize;
2363 bp->b_flags |= B_READ;
2364 bp->b_resid = dsize;
2365
2366 bdev_strategy(bp);
2367 error = biowait(bp);
2368
2369 if (!error) {
2370 memcpy(data, bp->b_data, msize);
2371 }
2372
2373 brelse(bp, 0);
2374 return(error);
2375 }
2376
2377
2378 static int
2379 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2380 RF_ComponentLabel_t *clabel)
2381 {
2382 return raidwrite_component_area(dev, b_vp, clabel,
2383 sizeof(RF_ComponentLabel_t),
2384 rf_component_info_offset(),
2385 rf_component_info_size(secsize), 0);
2386 }
2387
2388 /* ARGSUSED */
2389 static int
2390 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2391 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2392 {
2393 struct buf *bp;
2394 int error;
2395
2396 /* get a block of the appropriate size... */
2397 bp = geteblk((int)dsize);
2398 bp->b_dev = dev;
2399
2400 /* get our ducks in a row for the write */
2401 bp->b_blkno = offset / DEV_BSIZE;
2402 bp->b_bcount = dsize;
2403 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2404 bp->b_resid = dsize;
2405
2406 memset(bp->b_data, 0, dsize);
2407 memcpy(bp->b_data, data, msize);
2408
2409 bdev_strategy(bp);
2410 if (asyncp)
2411 return 0;
2412 error = biowait(bp);
2413 brelse(bp, 0);
2414 if (error) {
2415 #if 1
2416 printf("Failed to write RAID component info!\n");
2417 #endif
2418 }
2419
2420 return(error);
2421 }
2422
2423 void
2424 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2425 {
2426 int c;
2427
2428 for (c = 0; c < raidPtr->numCol; c++) {
2429 /* Skip dead disks. */
2430 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2431 continue;
2432 /* XXXjld: what if an error occurs here? */
2433 raidwrite_component_area(raidPtr->Disks[c].dev,
2434 raidPtr->raid_cinfo[c].ci_vp, map,
2435 RF_PARITYMAP_NBYTE,
2436 rf_parity_map_offset(raidPtr),
2437 rf_parity_map_size(raidPtr), 0);
2438 }
2439 }
2440
2441 void
2442 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2443 {
2444 struct rf_paritymap_ondisk tmp;
2445 int c,first;
2446
2447 first=1;
2448 for (c = 0; c < raidPtr->numCol; c++) {
2449 /* Skip dead disks. */
2450 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2451 continue;
2452 raidread_component_area(raidPtr->Disks[c].dev,
2453 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2454 RF_PARITYMAP_NBYTE,
2455 rf_parity_map_offset(raidPtr),
2456 rf_parity_map_size(raidPtr));
2457 if (first) {
2458 memcpy(map, &tmp, sizeof(*map));
2459 first = 0;
2460 } else {
2461 rf_paritymap_merge(map, &tmp);
2462 }
2463 }
2464 }
2465
2466 void
2467 rf_markalldirty(RF_Raid_t *raidPtr)
2468 {
2469 RF_ComponentLabel_t *clabel;
2470 int sparecol;
2471 int c;
2472 int j;
2473 int scol = -1;
2474
2475 raidPtr->mod_counter++;
2476 for (c = 0; c < raidPtr->numCol; c++) {
2477 /* we don't want to touch (at all) a disk that has
2478 failed */
2479 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2480 clabel = raidget_component_label(raidPtr, c);
2481 if (clabel->status == rf_ds_spared) {
2482 /* XXX do something special...
2483 but whatever you do, don't
2484 try to access it!! */
2485 } else {
2486 raidmarkdirty(raidPtr, c);
2487 }
2488 }
2489 }
2490
2491 for( c = 0; c < raidPtr->numSpare ; c++) {
2492 sparecol = raidPtr->numCol + c;
2493 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2494 /*
2495
2496 we claim this disk is "optimal" if it's
2497 rf_ds_used_spare, as that means it should be
2498 directly substitutable for the disk it replaced.
2499 We note that too...
2500
2501 */
2502
2503 for(j=0;j<raidPtr->numCol;j++) {
2504 if (raidPtr->Disks[j].spareCol == sparecol) {
2505 scol = j;
2506 break;
2507 }
2508 }
2509
2510 clabel = raidget_component_label(raidPtr, sparecol);
2511 /* make sure status is noted */
2512
2513 raid_init_component_label(raidPtr, clabel);
2514
2515 clabel->row = 0;
2516 clabel->column = scol;
2517 /* Note: we *don't* change status from rf_ds_used_spare
2518 to rf_ds_optimal */
2519 /* clabel.status = rf_ds_optimal; */
2520
2521 raidmarkdirty(raidPtr, sparecol);
2522 }
2523 }
2524 }
2525
2526
2527 void
2528 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2529 {
2530 RF_ComponentLabel_t *clabel;
2531 int sparecol;
2532 int c;
2533 int j;
2534 int scol;
2535 struct raid_softc *rs = raidPtr->softc;
2536
2537 scol = -1;
2538
2539 /* XXX should do extra checks to make sure things really are clean,
2540 rather than blindly setting the clean bit... */
2541
2542 raidPtr->mod_counter++;
2543
2544 for (c = 0; c < raidPtr->numCol; c++) {
2545 if (raidPtr->Disks[c].status == rf_ds_optimal) {
2546 clabel = raidget_component_label(raidPtr, c);
2547 /* make sure status is noted */
2548 clabel->status = rf_ds_optimal;
2549
2550 /* note what unit we are configured as */
2551 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2552 clabel->last_unit = raidPtr->raidid;
2553
2554 raidflush_component_label(raidPtr, c);
2555 if (final == RF_FINAL_COMPONENT_UPDATE) {
2556 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2557 raidmarkclean(raidPtr, c);
2558 }
2559 }
2560 }
2561 /* else we don't touch it.. */
2562 }
2563
2564 for( c = 0; c < raidPtr->numSpare ; c++) {
2565 sparecol = raidPtr->numCol + c;
2566 /* Need to ensure that the reconstruct actually completed! */
2567 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2568 /*
2569
2570 we claim this disk is "optimal" if it's
2571 rf_ds_used_spare, as that means it should be
2572 directly substitutable for the disk it replaced.
2573 We note that too...
2574
2575 */
2576
2577 for(j=0;j<raidPtr->numCol;j++) {
2578 if (raidPtr->Disks[j].spareCol == sparecol) {
2579 scol = j;
2580 break;
2581 }
2582 }
2583
2584 /* XXX shouldn't *really* need this... */
2585 clabel = raidget_component_label(raidPtr, sparecol);
2586 /* make sure status is noted */
2587
2588 raid_init_component_label(raidPtr, clabel);
2589
2590 clabel->column = scol;
2591 clabel->status = rf_ds_optimal;
2592 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2593 clabel->last_unit = raidPtr->raidid;
2594
2595 raidflush_component_label(raidPtr, sparecol);
2596 if (final == RF_FINAL_COMPONENT_UPDATE) {
2597 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2598 raidmarkclean(raidPtr, sparecol);
2599 }
2600 }
2601 }
2602 }
2603 }
2604
2605 void
2606 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2607 {
2608
2609 if (vp != NULL) {
2610 if (auto_configured == 1) {
2611 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2612 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2613 vput(vp);
2614
2615 } else {
2616 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2617 }
2618 }
2619 }
2620
2621
2622 void
2623 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2624 {
2625 int r,c;
2626 struct vnode *vp;
2627 int acd;
2628
2629
2630 /* We take this opportunity to close the vnodes like we should.. */
2631
2632 for (c = 0; c < raidPtr->numCol; c++) {
2633 vp = raidPtr->raid_cinfo[c].ci_vp;
2634 acd = raidPtr->Disks[c].auto_configured;
2635 rf_close_component(raidPtr, vp, acd);
2636 raidPtr->raid_cinfo[c].ci_vp = NULL;
2637 raidPtr->Disks[c].auto_configured = 0;
2638 }
2639
2640 for (r = 0; r < raidPtr->numSpare; r++) {
2641 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2642 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2643 rf_close_component(raidPtr, vp, acd);
2644 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2645 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2646 }
2647 }
2648
2649
2650 void
2651 rf_ReconThread(struct rf_recon_req *req)
2652 {
2653 int s;
2654 RF_Raid_t *raidPtr;
2655
2656 s = splbio();
2657 raidPtr = (RF_Raid_t *) req->raidPtr;
2658 raidPtr->recon_in_progress = 1;
2659
2660 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2661 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2662
2663 RF_Free(req, sizeof(*req));
2664
2665 raidPtr->recon_in_progress = 0;
2666 splx(s);
2667
2668 /* That's all... */
2669 kthread_exit(0); /* does not return */
2670 }
2671
2672 void
2673 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2674 {
2675 int retcode;
2676 int s;
2677
2678 raidPtr->parity_rewrite_stripes_done = 0;
2679 raidPtr->parity_rewrite_in_progress = 1;
2680 s = splbio();
2681 retcode = rf_RewriteParity(raidPtr);
2682 splx(s);
2683 if (retcode) {
2684 printf("raid%d: Error re-writing parity (%d)!\n",
2685 raidPtr->raidid, retcode);
2686 } else {
2687 /* set the clean bit! If we shutdown correctly,
2688 the clean bit on each component label will get
2689 set */
2690 raidPtr->parity_good = RF_RAID_CLEAN;
2691 }
2692 raidPtr->parity_rewrite_in_progress = 0;
2693
2694 /* Anyone waiting for us to stop? If so, inform them... */
2695 if (raidPtr->waitShutdown) {
2696 wakeup(&raidPtr->parity_rewrite_in_progress);
2697 }
2698
2699 /* That's all... */
2700 kthread_exit(0); /* does not return */
2701 }
2702
2703
2704 void
2705 rf_CopybackThread(RF_Raid_t *raidPtr)
2706 {
2707 int s;
2708
2709 raidPtr->copyback_in_progress = 1;
2710 s = splbio();
2711 rf_CopybackReconstructedData(raidPtr);
2712 splx(s);
2713 raidPtr->copyback_in_progress = 0;
2714
2715 /* That's all... */
2716 kthread_exit(0); /* does not return */
2717 }
2718
2719
2720 void
2721 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2722 {
2723 int s;
2724 RF_Raid_t *raidPtr;
2725
2726 s = splbio();
2727 raidPtr = req->raidPtr;
2728 raidPtr->recon_in_progress = 1;
2729 rf_ReconstructInPlace(raidPtr, req->col);
2730 RF_Free(req, sizeof(*req));
2731 raidPtr->recon_in_progress = 0;
2732 splx(s);
2733
2734 /* That's all... */
2735 kthread_exit(0); /* does not return */
2736 }
2737
2738 static RF_AutoConfig_t *
2739 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2740 const char *cname, RF_SectorCount_t size, uint64_t numsecs,
2741 unsigned secsize)
2742 {
2743 int good_one = 0;
2744 RF_ComponentLabel_t *clabel;
2745 RF_AutoConfig_t *ac;
2746
2747 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
2748 if (clabel == NULL) {
2749 oomem:
2750 while(ac_list) {
2751 ac = ac_list;
2752 if (ac->clabel)
2753 free(ac->clabel, M_RAIDFRAME);
2754 ac_list = ac_list->next;
2755 free(ac, M_RAIDFRAME);
2756 }
2757 printf("RAID auto config: out of memory!\n");
2758 return NULL; /* XXX probably should panic? */
2759 }
2760
2761 if (!raidread_component_label(secsize, dev, vp, clabel)) {
2762 /* Got the label. Does it look reasonable? */
2763 if (rf_reasonable_label(clabel, numsecs) &&
2764 (rf_component_label_partitionsize(clabel) <= size)) {
2765 #ifdef DEBUG
2766 printf("Component on: %s: %llu\n",
2767 cname, (unsigned long long)size);
2768 rf_print_component_label(clabel);
2769 #endif
2770 /* if it's reasonable, add it, else ignore it. */
2771 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2772 M_NOWAIT);
2773 if (ac == NULL) {
2774 free(clabel, M_RAIDFRAME);
2775 goto oomem;
2776 }
2777 strlcpy(ac->devname, cname, sizeof(ac->devname));
2778 ac->dev = dev;
2779 ac->vp = vp;
2780 ac->clabel = clabel;
2781 ac->next = ac_list;
2782 ac_list = ac;
2783 good_one = 1;
2784 }
2785 }
2786 if (!good_one) {
2787 /* cleanup */
2788 free(clabel, M_RAIDFRAME);
2789 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2790 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2791 vput(vp);
2792 }
2793 return ac_list;
2794 }
2795
2796 RF_AutoConfig_t *
2797 rf_find_raid_components(void)
2798 {
2799 struct vnode *vp;
2800 struct disklabel label;
2801 device_t dv;
2802 deviter_t di;
2803 dev_t dev;
2804 int bmajor, bminor, wedge, rf_part_found;
2805 int error;
2806 int i;
2807 RF_AutoConfig_t *ac_list;
2808 uint64_t numsecs;
2809 unsigned secsize;
2810 int dowedges;
2811
2812 /* initialize the AutoConfig list */
2813 ac_list = NULL;
2814
2815 /*
2816 * we begin by trolling through *all* the devices on the system *twice*
2817 * first we scan for wedges, second for other devices. This avoids
2818 * using a raw partition instead of a wedge that covers the whole disk
2819 */
2820
2821 for (dowedges=1; dowedges>=0; --dowedges) {
2822 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
2823 dv = deviter_next(&di)) {
2824
2825 /* we are only interested in disks... */
2826 if (device_class(dv) != DV_DISK)
2827 continue;
2828
2829 /* we don't care about floppies... */
2830 if (device_is_a(dv, "fd")) {
2831 continue;
2832 }
2833
2834 /* we don't care about CD's... */
2835 if (device_is_a(dv, "cd")) {
2836 continue;
2837 }
2838
2839 /* we don't care about md's... */
2840 if (device_is_a(dv, "md")) {
2841 continue;
2842 }
2843
2844 /* hdfd is the Atari/Hades floppy driver */
2845 if (device_is_a(dv, "hdfd")) {
2846 continue;
2847 }
2848
2849 /* fdisa is the Atari/Milan floppy driver */
2850 if (device_is_a(dv, "fdisa")) {
2851 continue;
2852 }
2853
2854 /* are we in the wedges pass ? */
2855 wedge = device_is_a(dv, "dk");
2856 if (wedge != dowedges) {
2857 continue;
2858 }
2859
2860 /* need to find the device_name_to_block_device_major stuff */
2861 bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
2862
2863 rf_part_found = 0; /*No raid partition as yet*/
2864
2865 /* get a vnode for the raw partition of this disk */
2866 bminor = minor(device_unit(dv));
2867 dev = wedge ? makedev(bmajor, bminor) :
2868 MAKEDISKDEV(bmajor, bminor, RAW_PART);
2869 if (bdevvp(dev, &vp))
2870 panic("RAID can't alloc vnode");
2871
2872 error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
2873
2874 if (error) {
2875 /* "Who cares." Continue looking
2876 for something that exists*/
2877 vput(vp);
2878 continue;
2879 }
2880
2881 error = getdisksize(vp, &numsecs, &secsize);
2882 if (error) {
2883 /*
2884 * Pseudo devices like vnd and cgd can be
2885 * opened but may still need some configuration.
2886 * Ignore these quietly.
2887 */
2888 if (error != ENXIO)
2889 printf("RAIDframe: can't get disk size"
2890 " for dev %s (%d)\n",
2891 device_xname(dv), error);
2892 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2893 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2894 vput(vp);
2895 continue;
2896 }
2897 if (wedge) {
2898 struct dkwedge_info dkw;
2899 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
2900 NOCRED);
2901 if (error) {
2902 printf("RAIDframe: can't get wedge info for "
2903 "dev %s (%d)\n", device_xname(dv), error);
2904 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2905 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2906 vput(vp);
2907 continue;
2908 }
2909
2910 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
2911 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2912 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2913 vput(vp);
2914 continue;
2915 }
2916
2917 ac_list = rf_get_component(ac_list, dev, vp,
2918 device_xname(dv), dkw.dkw_size, numsecs, secsize);
2919 rf_part_found = 1; /*There is a raid component on this disk*/
2920 continue;
2921 }
2922
2923 /* Ok, the disk exists. Go get the disklabel. */
2924 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
2925 if (error) {
2926 /*
2927 * XXX can't happen - open() would
2928 * have errored out (or faked up one)
2929 */
2930 if (error != ENOTTY)
2931 printf("RAIDframe: can't get label for dev "
2932 "%s (%d)\n", device_xname(dv), error);
2933 }
2934
2935 /* don't need this any more. We'll allocate it again
2936 a little later if we really do... */
2937 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2938 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2939 vput(vp);
2940
2941 if (error)
2942 continue;
2943
2944 rf_part_found = 0; /*No raid partitions yet*/
2945 for (i = 0; i < label.d_npartitions; i++) {
2946 char cname[sizeof(ac_list->devname)];
2947
2948 /* We only support partitions marked as RAID */
2949 if (label.d_partitions[i].p_fstype != FS_RAID)
2950 continue;
2951
2952 dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
2953 if (bdevvp(dev, &vp))
2954 panic("RAID can't alloc vnode");
2955
2956 error = VOP_OPEN(vp, FREAD, NOCRED);
2957 if (error) {
2958 /* Whatever... */
2959 vput(vp);
2960 continue;
2961 }
2962 snprintf(cname, sizeof(cname), "%s%c",
2963 device_xname(dv), 'a' + i);
2964 ac_list = rf_get_component(ac_list, dev, vp, cname,
2965 label.d_partitions[i].p_size, numsecs, secsize);
2966 rf_part_found = 1; /*There is at least one raid partition on this disk*/
2967 }
2968
2969 /*
2970 *If there is no raid component on this disk, either in a
2971 *disklabel or inside a wedge, check the raw partition as well,
2972 *as it is possible to configure raid components on raw disk
2973 *devices.
2974 */
2975
2976 if (!rf_part_found) {
2977 char cname[sizeof(ac_list->devname)];
2978
2979 dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
2980 if (bdevvp(dev, &vp))
2981 panic("RAID can't alloc vnode");
2982
2983 error = VOP_OPEN(vp, FREAD, NOCRED);
2984 if (error) {
2985 /* Whatever... */
2986 vput(vp);
2987 continue;
2988 }
2989 snprintf(cname, sizeof(cname), "%s%c",
2990 device_xname(dv), 'a' + RAW_PART);
2991 ac_list = rf_get_component(ac_list, dev, vp, cname,
2992 label.d_partitions[RAW_PART].p_size, numsecs, secsize);
2993 }
2994 }
2995 deviter_release(&di);
2996 }
2997 return ac_list;
2998 }
2999
3000
3001 int
3002 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3003 {
3004
3005 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3006 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3007 ((clabel->clean == RF_RAID_CLEAN) ||
3008 (clabel->clean == RF_RAID_DIRTY)) &&
3009 clabel->row >=0 &&
3010 clabel->column >= 0 &&
3011 clabel->num_rows > 0 &&
3012 clabel->num_columns > 0 &&
3013 clabel->row < clabel->num_rows &&
3014 clabel->column < clabel->num_columns &&
3015 clabel->blockSize > 0 &&
3016 /*
3017 * numBlocksHi may contain garbage, but it is ok since
3018 * the type is unsigned. If it is really garbage,
3019 * rf_fix_old_label_size() will fix it.
3020 */
3021 rf_component_label_numblocks(clabel) > 0) {
3022 /*
3023 * label looks reasonable enough...
3024 * let's make sure it has no old garbage.
3025 */
3026 if (numsecs)
3027 rf_fix_old_label_size(clabel, numsecs);
3028 return(1);
3029 }
3030 return(0);
3031 }
3032
3033
3034 /*
3035 * For reasons yet unknown, some old component labels have garbage in
3036 * the newer numBlocksHi region, and this causes lossage. Since those
3037 * disks will also have numsecs set to less than 32 bits of sectors,
3038 * we can determine when this corruption has occurred, and fix it.
3039 *
3040 * The exact same problem, with the same unknown reason, happens to
3041 * the partitionSizeHi member as well.
3042 */
3043 static void
3044 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3045 {
3046
3047 if (numsecs < ((uint64_t)1 << 32)) {
3048 if (clabel->numBlocksHi) {
3049 printf("WARNING: total sectors < 32 bits, yet "
3050 "numBlocksHi set\n"
3051 "WARNING: resetting numBlocksHi to zero.\n");
3052 clabel->numBlocksHi = 0;
3053 }
3054
3055 if (clabel->partitionSizeHi) {
3056 printf("WARNING: total sectors < 32 bits, yet "
3057 "partitionSizeHi set\n"
3058 "WARNING: resetting partitionSizeHi to zero.\n");
3059 clabel->partitionSizeHi = 0;
3060 }
3061 }
3062 }
3063
3064
#ifdef DEBUG
/*
 * Print the contents of a component label to the console.
 * Only built when DEBUG is defined.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	/* Names for the low two bits of root_partition. */
	static const char *rp[] = {
		"No", "Force", "Soft", "*invalid*"
	};
	const uint64_t numBlocks = rf_component_label_numblocks(clabel);
	const char *is_clean = clabel->clean ? "Yes" : "No";
	const char *is_auto = clabel->autoconfigure ? "Yes" : "No";

	printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	    clabel->row, clabel->column,
	    clabel->num_rows, clabel->num_columns);
	printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
	    clabel->version, clabel->serial_number,
	    clabel->mod_counter);
	printf(" Clean: %s Status: %d\n",
	    is_clean, clabel->status);
	printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	    clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n",
	    (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf(" Autoconfig: %s\n", is_auto);
	printf(" Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf(" Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf(" Config order: %d\n", clabel->config_order);
#endif
}
#endif
3098
3099 RF_ConfigSet_t *
3100 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3101 {
3102 RF_AutoConfig_t *ac;
3103 RF_ConfigSet_t *config_sets;
3104 RF_ConfigSet_t *cset;
3105 RF_AutoConfig_t *ac_next;
3106
3107
3108 config_sets = NULL;
3109
3110 /* Go through the AutoConfig list, and figure out which components
3111 belong to what sets. */
3112 ac = ac_list;
3113 while(ac!=NULL) {
3114 /* we're going to putz with ac->next, so save it here
3115 for use at the end of the loop */
3116 ac_next = ac->next;
3117
3118 if (config_sets == NULL) {
3119 /* will need at least this one... */
3120 config_sets = (RF_ConfigSet_t *)
3121 malloc(sizeof(RF_ConfigSet_t),
3122 M_RAIDFRAME, M_NOWAIT);
3123 if (config_sets == NULL) {
3124 panic("rf_create_auto_sets: No memory!");
3125 }
3126 /* this one is easy :) */
3127 config_sets->ac = ac;
3128 config_sets->next = NULL;
3129 config_sets->rootable = 0;
3130 ac->next = NULL;
3131 } else {
3132 /* which set does this component fit into? */
3133 cset = config_sets;
3134 while(cset!=NULL) {
3135 if (rf_does_it_fit(cset, ac)) {
3136 /* looks like it matches... */
3137 ac->next = cset->ac;
3138 cset->ac = ac;
3139 break;
3140 }
3141 cset = cset->next;
3142 }
3143 if (cset==NULL) {
3144 /* didn't find a match above... new set..*/
3145 cset = (RF_ConfigSet_t *)
3146 malloc(sizeof(RF_ConfigSet_t),
3147 M_RAIDFRAME, M_NOWAIT);
3148 if (cset == NULL) {
3149 panic("rf_create_auto_sets: No memory!");
3150 }
3151 cset->ac = ac;
3152 ac->next = NULL;
3153 cset->next = config_sets;
3154 cset->rootable = 0;
3155 config_sets = cset;
3156 }
3157 }
3158 ac = ac_next;
3159 }
3160
3161
3162 return(config_sets);
3163 }
3164
3165 static int
3166 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3167 {
3168 RF_ComponentLabel_t *clabel1, *clabel2;
3169
3170 /* If this one matches the *first* one in the set, that's good
3171 enough, since the other members of the set would have been
3172 through here too... */
3173 /* note that we are not checking partitionSize here..
3174
3175 Note that we are also not checking the mod_counters here.
3176 If everything else matches except the mod_counter, that's
3177 good enough for this test. We will deal with the mod_counters
3178 a little later in the autoconfiguration process.
3179
3180 (clabel1->mod_counter == clabel2->mod_counter) &&
3181
3182 The reason we don't check for this is that failed disks
3183 will have lower modification counts. If those disks are
3184 not added to the set they used to belong to, then they will
3185 form their own set, which may result in 2 different sets,
3186 for example, competing to be configured at raid0, and
3187 perhaps competing to be the root filesystem set. If the
3188 wrong ones get configured, or both attempt to become /,
3189 weird behaviour and or serious lossage will occur. Thus we
3190 need to bring them into the fold here, and kick them out at
3191 a later point.
3192
3193 */
3194
3195 clabel1 = cset->ac->clabel;
3196 clabel2 = ac->clabel;
3197 if ((clabel1->version == clabel2->version) &&
3198 (clabel1->serial_number == clabel2->serial_number) &&
3199 (clabel1->num_rows == clabel2->num_rows) &&
3200 (clabel1->num_columns == clabel2->num_columns) &&
3201 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3202 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3203 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3204 (clabel1->parityConfig == clabel2->parityConfig) &&
3205 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3206 (clabel1->blockSize == clabel2->blockSize) &&
3207 rf_component_label_numblocks(clabel1) ==
3208 rf_component_label_numblocks(clabel2) &&
3209 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3210 (clabel1->root_partition == clabel2->root_partition) &&
3211 (clabel1->last_unit == clabel2->last_unit) &&
3212 (clabel1->config_order == clabel2->config_order)) {
3213 /* if it get's here, it almost *has* to be a match */
3214 } else {
3215 /* it's not consistent with somebody in the set..
3216 punt */
3217 return(0);
3218 }
3219 /* all was fine.. it must fit... */
3220 return(1);
3221 }
3222
3223 int
3224 rf_have_enough_components(RF_ConfigSet_t *cset)
3225 {
3226 RF_AutoConfig_t *ac;
3227 RF_AutoConfig_t *auto_config;
3228 RF_ComponentLabel_t *clabel;
3229 int c;
3230 int num_cols;
3231 int num_missing;
3232 int mod_counter;
3233 int mod_counter_found;
3234 int even_pair_failed;
3235 char parity_type;
3236
3237
3238 /* check to see that we have enough 'live' components
3239 of this set. If so, we can configure it if necessary */
3240
3241 num_cols = cset->ac->clabel->num_columns;
3242 parity_type = cset->ac->clabel->parityConfig;
3243
3244 /* XXX Check for duplicate components!?!?!? */
3245
3246 /* Determine what the mod_counter is supposed to be for this set. */
3247
3248 mod_counter_found = 0;
3249 mod_counter = 0;
3250 ac = cset->ac;
3251 while(ac!=NULL) {
3252 if (mod_counter_found==0) {
3253 mod_counter = ac->clabel->mod_counter;
3254 mod_counter_found = 1;
3255 } else {
3256 if (ac->clabel->mod_counter > mod_counter) {
3257 mod_counter = ac->clabel->mod_counter;
3258 }
3259 }
3260 ac = ac->next;
3261 }
3262
3263 num_missing = 0;
3264 auto_config = cset->ac;
3265
3266 even_pair_failed = 0;
3267 for(c=0; c<num_cols; c++) {
3268 ac = auto_config;
3269 while(ac!=NULL) {
3270 if ((ac->clabel->column == c) &&
3271 (ac->clabel->mod_counter == mod_counter)) {
3272 /* it's this one... */
3273 #ifdef DEBUG
3274 printf("Found: %s at %d\n",
3275 ac->devname,c);
3276 #endif
3277 break;
3278 }
3279 ac=ac->next;
3280 }
3281 if (ac==NULL) {
3282 /* Didn't find one here! */
3283 /* special case for RAID 1, especially
3284 where there are more than 2
3285 components (where RAIDframe treats
3286 things a little differently :( ) */
3287 if (parity_type == '1') {
3288 if (c%2 == 0) { /* even component */
3289 even_pair_failed = 1;
3290 } else { /* odd component. If
3291 we're failed, and
3292 so is the even
3293 component, it's
3294 "Good Night, Charlie" */
3295 if (even_pair_failed == 1) {
3296 return(0);
3297 }
3298 }
3299 } else {
3300 /* normal accounting */
3301 num_missing++;
3302 }
3303 }
3304 if ((parity_type == '1') && (c%2 == 1)) {
3305 /* Just did an even component, and we didn't
3306 bail.. reset the even_pair_failed flag,
3307 and go on to the next component.... */
3308 even_pair_failed = 0;
3309 }
3310 }
3311
3312 clabel = cset->ac->clabel;
3313
3314 if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3315 ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3316 ((clabel->parityConfig == '5') && (num_missing > 1))) {
3317 /* XXX this needs to be made *much* more general */
3318 /* Too many failures */
3319 return(0);
3320 }
3321 /* otherwise, all is well, and we've got enough to take a kick
3322 at autoconfiguring this set */
3323 return(1);
3324 }
3325
3326 void
3327 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3328 RF_Raid_t *raidPtr)
3329 {
3330 RF_ComponentLabel_t *clabel;
3331 int i;
3332
3333 clabel = ac->clabel;
3334
3335 /* 1. Fill in the common stuff */
3336 config->numRow = clabel->num_rows = 1;
3337 config->numCol = clabel->num_columns;
3338 config->numSpare = 0; /* XXX should this be set here? */
3339 config->sectPerSU = clabel->sectPerSU;
3340 config->SUsPerPU = clabel->SUsPerPU;
3341 config->SUsPerRU = clabel->SUsPerRU;
3342 config->parityConfig = clabel->parityConfig;
3343 /* XXX... */
3344 strcpy(config->diskQueueType,"fifo");
3345 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3346 config->layoutSpecificSize = 0; /* XXX ?? */
3347
3348 while(ac!=NULL) {
3349 /* row/col values will be in range due to the checks
3350 in reasonable_label() */
3351 strcpy(config->devnames[0][ac->clabel->column],
3352 ac->devname);
3353 ac = ac->next;
3354 }
3355
3356 for(i=0;i<RF_MAXDBGV;i++) {
3357 config->debugVars[i][0] = 0;
3358 }
3359 }
3360
3361 int
3362 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3363 {
3364 RF_ComponentLabel_t *clabel;
3365 int column;
3366 int sparecol;
3367
3368 raidPtr->autoconfigure = new_value;
3369
3370 for(column=0; column<raidPtr->numCol; column++) {
3371 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3372 clabel = raidget_component_label(raidPtr, column);
3373 clabel->autoconfigure = new_value;
3374 raidflush_component_label(raidPtr, column);
3375 }
3376 }
3377 for(column = 0; column < raidPtr->numSpare ; column++) {
3378 sparecol = raidPtr->numCol + column;
3379 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3380 clabel = raidget_component_label(raidPtr, sparecol);
3381 clabel->autoconfigure = new_value;
3382 raidflush_component_label(raidPtr, sparecol);
3383 }
3384 }
3385 return(new_value);
3386 }
3387
3388 int
3389 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3390 {
3391 RF_ComponentLabel_t *clabel;
3392 int column;
3393 int sparecol;
3394
3395 raidPtr->root_partition = new_value;
3396 for(column=0; column<raidPtr->numCol; column++) {
3397 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3398 clabel = raidget_component_label(raidPtr, column);
3399 clabel->root_partition = new_value;
3400 raidflush_component_label(raidPtr, column);
3401 }
3402 }
3403 for(column = 0; column < raidPtr->numSpare ; column++) {
3404 sparecol = raidPtr->numCol + column;
3405 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3406 clabel = raidget_component_label(raidPtr, sparecol);
3407 clabel->root_partition = new_value;
3408 raidflush_component_label(raidPtr, sparecol);
3409 }
3410 }
3411 return(new_value);
3412 }
3413
3414 void
3415 rf_release_all_vps(RF_ConfigSet_t *cset)
3416 {
3417 RF_AutoConfig_t *ac;
3418
3419 ac = cset->ac;
3420 while(ac!=NULL) {
3421 /* Close the vp, and give it back */
3422 if (ac->vp) {
3423 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3424 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3425 vput(ac->vp);
3426 ac->vp = NULL;
3427 }
3428 ac = ac->next;
3429 }
3430 }
3431
3432
3433 void
3434 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3435 {
3436 RF_AutoConfig_t *ac;
3437 RF_AutoConfig_t *next_ac;
3438
3439 ac = cset->ac;
3440 while(ac!=NULL) {
3441 next_ac = ac->next;
3442 /* nuke the label */
3443 free(ac->clabel, M_RAIDFRAME);
3444 /* cleanup the config structure */
3445 free(ac, M_RAIDFRAME);
3446 /* "next.." */
3447 ac = next_ac;
3448 }
3449 /* and, finally, nuke the config set */
3450 free(cset, M_RAIDFRAME);
3451 }
3452
3453
3454 void
3455 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3456 {
3457 /* current version number */
3458 clabel->version = RF_COMPONENT_LABEL_VERSION;
3459 clabel->serial_number = raidPtr->serial_number;
3460 clabel->mod_counter = raidPtr->mod_counter;
3461
3462 clabel->num_rows = 1;
3463 clabel->num_columns = raidPtr->numCol;
3464 clabel->clean = RF_RAID_DIRTY; /* not clean */
3465 clabel->status = rf_ds_optimal; /* "It's good!" */
3466
3467 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3468 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3469 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3470
3471 clabel->blockSize = raidPtr->bytesPerSector;
3472 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3473
3474 /* XXX not portable */
3475 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3476 clabel->maxOutstanding = raidPtr->maxOutstanding;
3477 clabel->autoconfigure = raidPtr->autoconfigure;
3478 clabel->root_partition = raidPtr->root_partition;
3479 clabel->last_unit = raidPtr->raidid;
3480 clabel->config_order = raidPtr->config_order;
3481
3482 #ifndef RF_NO_PARITY_MAP
3483 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3484 #endif
3485 }
3486
3487 struct raid_softc *
3488 rf_auto_config_set(RF_ConfigSet_t *cset)
3489 {
3490 RF_Raid_t *raidPtr;
3491 RF_Config_t *config;
3492 int raidID;
3493 struct raid_softc *sc;
3494
3495 #ifdef DEBUG
3496 printf("RAID autoconfigure\n");
3497 #endif
3498
3499 /* 1. Create a config structure */
3500 config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
3501 if (config == NULL) {
3502 printf("%s: Out of mem - config!?!?\n", __func__);
3503 /* XXX do something more intelligent here. */
3504 return NULL;
3505 }
3506
3507 /*
3508 2. Figure out what RAID ID this one is supposed to live at
3509 See if we can get the same RAID dev that it was configured
3510 on last time..
3511 */
3512
3513 raidID = cset->ac->clabel->last_unit;
3514 for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
3515 sc = raidget(++raidID, false))
3516 continue;
3517 #ifdef DEBUG
3518 printf("Configuring raid%d:\n",raidID);
3519 #endif
3520
3521 if (sc == NULL)
3522 sc = raidget(raidID, true);
3523 if (sc == NULL) {
3524 printf("%s: Out of mem - softc!?!?\n", __func__);
3525 /* XXX do something more intelligent here. */
3526 free(config, M_RAIDFRAME);
3527 return NULL;
3528 }
3529
3530 raidPtr = &sc->sc_r;
3531
3532 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3533 raidPtr->softc = sc;
3534 raidPtr->raidid = raidID;
3535 raidPtr->openings = RAIDOUTSTANDING;
3536
3537 /* 3. Build the configuration structure */
3538 rf_create_configuration(cset->ac, config, raidPtr);
3539
3540 /* 4. Do the configuration */
3541 if (rf_Configure(raidPtr, config, cset->ac) == 0) {
3542 raidinit(sc);
3543
3544 rf_markalldirty(raidPtr);
3545 raidPtr->autoconfigure = 1; /* XXX do this here? */
3546 switch (cset->ac->clabel->root_partition) {
3547 case 1: /* Force Root */
3548 case 2: /* Soft Root: root when boot partition part of raid */
3549 /*
3550 * everything configured just fine. Make a note
3551 * that this set is eligible to be root,
3552 * or forced to be root
3553 */
3554 cset->rootable = cset->ac->clabel->root_partition;
3555 /* XXX do this here? */
3556 raidPtr->root_partition = cset->rootable;
3557 break;
3558 default:
3559 break;
3560 }
3561 } else {
3562 raidput(sc);
3563 sc = NULL;
3564 }
3565
3566 /* 5. Cleanup */
3567 free(config, M_RAIDFRAME);
3568 return sc;
3569 }
3570
/*
 * rf_pool_init --
 *	Set up one of RAIDframe's pools.
 *
 *	p	pool to initialize
 *	size	passed to pool_init() as the item size
 *	w_chan	passed to pool_init() as the wait channel name
 *	xmin	number of items to prime the pool with; also used as the
 *		low water mark
 *	xmax	high water mark
 *
 *	The pool is initialized at IPL_BIO.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	/* Prime the pool with xmin items before setting the low water mark. */
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3580
3581 /*
3582 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3583 * to see if there is IO pending and if that IO could possibly be done
3584 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3585 * otherwise.
3586 *
3587 */
3588 int
3589 rf_buf_queue_check(RF_Raid_t *raidPtr)
3590 {
3591 struct raid_softc *rs;
3592 struct dk_softc *dksc;
3593
3594 rs = raidPtr->softc;
3595 dksc = &rs->sc_dksc;
3596
3597 if ((rs->sc_flags & RAIDF_INITED) == 0)
3598 return 1;
3599
3600 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3601 /* there is work to do */
3602 return 0;
3603 }
3604 /* default is nothing to do */
3605 return 1;
3606 }
3607
3608 int
3609 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3610 {
3611 uint64_t numsecs;
3612 unsigned secsize;
3613 int error;
3614
3615 error = getdisksize(vp, &numsecs, &secsize);
3616 if (error == 0) {
3617 diskPtr->blockSize = secsize;
3618 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3619 diskPtr->partitionSize = numsecs;
3620 return 0;
3621 }
3622 return error;
3623 }
3624
/*
 * raid_match --
 *	Match routine for the raid pseudo-device: always reports a match
 *	(returns 1), whatever the arguments.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3630
/*
 * raid_attach --
 *	Attach routine for the raid pseudo-device.  Intentionally empty:
 *	nothing is done at attach time.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3635
3636
3637 static int
3638 raid_detach(device_t self, int flags)
3639 {
3640 int error;
3641 struct raid_softc *rs = raidsoftc(self);
3642
3643 if (rs == NULL)
3644 return ENXIO;
3645
3646 if ((error = raidlock(rs)) != 0)
3647 return (error);
3648
3649 error = raid_detach_unlocked(rs);
3650
3651 raidunlock(rs);
3652
3653 /* XXX raid can be referenced here */
3654
3655 if (error)
3656 return error;
3657
3658 /* Free the softc */
3659 raidput(rs);
3660
3661 return 0;
3662 }
3663
3664 static void
3665 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3666 {
3667 struct dk_softc *dksc = &rs->sc_dksc;
3668 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3669
3670 memset(dg, 0, sizeof(*dg));
3671
3672 dg->dg_secperunit = raidPtr->totalSectors;
3673 dg->dg_secsize = raidPtr->bytesPerSector;
3674 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3675 dg->dg_ntracks = 4 * raidPtr->numCol;
3676
3677 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3678 }
3679
3680 /*
3681 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3682 * We end up returning whatever error was returned by the first cache flush
3683 * that fails.
3684 */
3685
3686 int
3687 rf_sync_component_caches(RF_Raid_t *raidPtr)
3688 {
3689 int c, sparecol;
3690 int e,error;
3691 int force = 1;
3692
3693 error = 0;
3694 for (c = 0; c < raidPtr->numCol; c++) {
3695 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3696 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3697 &force, FWRITE, NOCRED);
3698 if (e) {
3699 if (e != ENODEV)
3700 printf("raid%d: cache flush to component %s failed.\n",
3701 raidPtr->raidid, raidPtr->Disks[c].devname);
3702 if (error == 0) {
3703 error = e;
3704 }
3705 }
3706 }
3707 }
3708
3709 for( c = 0; c < raidPtr->numSpare ; c++) {
3710 sparecol = raidPtr->numCol + c;
3711 /* Need to ensure that the reconstruct actually completed! */
3712 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3713 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3714 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3715 if (e) {
3716 if (e != ENODEV)
3717 printf("raid%d: cache flush to component %s failed.\n",
3718 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3719 if (error == 0) {
3720 error = e;
3721 }
3722 }
3723 }
3724 }
3725 return error;
3726 }
3727
3728 /*
3729 * Module interface
3730 */
3731
/*
 * Declare the "raid" module, of class MODULE_CLASS_DRIVER.  The last
 * argument names "dk_subr" -- presumably the module this one requires;
 * see module(9) to confirm.
 */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");

/* The cfdriver is only declared here when building as a loadable module. */
#ifdef _MODULE
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* Module command dispatcher and its init/fini workers. */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
3741
3742 static int
3743 raid_modcmd(modcmd_t cmd, void *data)
3744 {
3745 int error;
3746
3747 error = 0;
3748 switch (cmd) {
3749 case MODULE_CMD_INIT:
3750 error = raid_modcmd_init();
3751 break;
3752 case MODULE_CMD_FINI:
3753 error = raid_modcmd_fini();
3754 break;
3755 default:
3756 error = ENOTTY;
3757 break;
3758 }
3759 return error;
3760 }
3761
3762 static int
3763 raid_modcmd_init(void)
3764 {
3765 int error;
3766 int bmajor, cmajor;
3767
3768 mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
3769 mutex_enter(&raid_lock);
3770 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3771 rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
3772 rf_init_cond2(rf_sparet_wait_cv, "sparetw");
3773 rf_init_cond2(rf_sparet_resp_cv, "rfgst");
3774
3775 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
3776 #endif
3777
3778 bmajor = cmajor = -1;
3779 error = devsw_attach("raid", &raid_bdevsw, &bmajor,
3780 &raid_cdevsw, &cmajor);
3781 if (error != 0 && error != EEXIST) {
3782 aprint_error("%s: devsw_attach failed %d\n", __func__, error);
3783 mutex_exit(&raid_lock);
3784 return error;
3785 }
3786 #ifdef _MODULE
3787 error = config_cfdriver_attach(&raid_cd);
3788 if (error != 0) {
3789 aprint_error("%s: config_cfdriver_attach failed %d\n",
3790 __func__, error);
3791 devsw_detach(&raid_bdevsw, &raid_cdevsw);
3792 mutex_exit(&raid_lock);
3793 return error;
3794 }
3795 #endif
3796 error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3797 if (error != 0) {
3798 aprint_error("%s: config_cfattach_attach failed %d\n",
3799 __func__, error);
3800 #ifdef _MODULE
3801 config_cfdriver_detach(&raid_cd);
3802 #endif
3803 devsw_detach(&raid_bdevsw, &raid_cdevsw);
3804 mutex_exit(&raid_lock);
3805 return error;
3806 }
3807
3808 raidautoconfigdone = false;
3809
3810 mutex_exit(&raid_lock);
3811
3812 if (error == 0) {
3813 if (rf_BootRaidframe(true) == 0)
3814 aprint_verbose("Kernelized RAIDframe activated\n");
3815 else
3816 panic("Serious error activating RAID!!");
3817 }
3818
3819 /*
3820 * Register a finalizer which will be used to auto-config RAID
3821 * sets once all real hardware devices have been found.
3822 */
3823 error = config_finalize_register(NULL, rf_autoconfig);
3824 if (error != 0) {
3825 aprint_error("WARNING: unable to register RAIDframe "
3826 "finalizer\n");
3827 error = 0;
3828 }
3829
3830 return error;
3831 }
3832
3833 static int
3834 raid_modcmd_fini(void)
3835 {
3836 int error;
3837
3838 mutex_enter(&raid_lock);
3839
3840 /* Don't allow unload if raid device(s) exist. */
3841 if (!LIST_EMPTY(&raids)) {
3842 mutex_exit(&raid_lock);
3843 return EBUSY;
3844 }
3845
3846 error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
3847 if (error != 0) {
3848 aprint_error("%s: cannot detach cfattach\n",__func__);
3849 mutex_exit(&raid_lock);
3850 return error;
3851 }
3852 #ifdef _MODULE
3853 error = config_cfdriver_detach(&raid_cd);
3854 if (error != 0) {
3855 aprint_error("%s: cannot detach cfdriver\n",__func__);
3856 config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3857 mutex_exit(&raid_lock);
3858 return error;
3859 }
3860 #endif
3861 error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
3862 if (error != 0) {
3863 aprint_error("%s: cannot detach devsw\n",__func__);
3864 #ifdef _MODULE
3865 config_cfdriver_attach(&raid_cd);
3866 #endif
3867 config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3868 mutex_exit(&raid_lock);
3869 return error;
3870 }
3871 rf_BootRaidframe(false);
3872 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3873 rf_destroy_mutex2(rf_sparet_wait_mutex);
3874 rf_destroy_cond2(rf_sparet_wait_cv);
3875 rf_destroy_cond2(rf_sparet_resp_cv);
3876 #endif
3877 mutex_exit(&raid_lock);
3878 mutex_destroy(&raid_lock);
3879
3880 return error;
3881 }
3882