rf_netbsdkintf.c revision 1.347.2.1 1 /* $NetBSD: rf_netbsdkintf.c,v 1.347.2.1 2017/04/21 16:53:52 bouyer Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.347.2.1 2017/04/21 16:53:52 bouyer Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130
131 #include <prop/proplib.h>
132
133 #include <dev/raidframe/raidframevar.h>
134 #include <dev/raidframe/raidframeio.h>
135 #include <dev/raidframe/rf_paritymap.h>
136
137 #include "rf_raid.h"
138 #include "rf_copyback.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150
151 #ifdef COMPAT_50
152 #include "rf_compat50.h"
153 #endif
154
155 #include "ioconf.h"
156
157 #ifdef DEBUG
158 int rf_kdebug_level = 0;
159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
160 #else /* DEBUG */
161 #define db1_printf(a) { }
162 #endif /* DEBUG */
163
164 #ifdef DEBUG_ROOT
165 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
166 #else
167 #define DPRINTF(a, ...)
168 #endif
169
170 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
171 static rf_declare_mutex2(rf_sparet_wait_mutex);
172 static rf_declare_cond2(rf_sparet_wait_cv);
173 static rf_declare_cond2(rf_sparet_resp_cv);
174
175 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
176 * spare table */
177 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
178 * installation process */
179 #endif
180
181 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
182
183 /* prototypes */
184 static void KernelWakeupFunc(struct buf *);
185 static void InitBP(struct buf *, struct vnode *, unsigned,
186 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
187 void *, int, struct proc *);
188 struct raid_softc;
189 static void raidinit(struct raid_softc *);
190 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
191 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
192
193 static int raid_match(device_t, cfdata_t, void *);
194 static void raid_attach(device_t, device_t, void *);
195 static int raid_detach(device_t, int);
196
197 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
198 daddr_t, daddr_t);
199 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
200 daddr_t, daddr_t, int);
201
202 static int raidwrite_component_label(unsigned,
203 dev_t, struct vnode *, RF_ComponentLabel_t *);
204 static int raidread_component_label(unsigned,
205 dev_t, struct vnode *, RF_ComponentLabel_t *);
206
207 static int raid_diskstart(device_t, struct buf *bp);
208 static int raid_dumpblocks(device_t, void *, daddr_t, int);
209 static int raid_lastclose(device_t);
210
211 static dev_type_open(raidopen);
212 static dev_type_close(raidclose);
213 static dev_type_read(raidread);
214 static dev_type_write(raidwrite);
215 static dev_type_ioctl(raidioctl);
216 static dev_type_strategy(raidstrategy);
217 static dev_type_dump(raiddump);
218 static dev_type_size(raidsize);
219
/* Block-device switch: entry points for the raid(4) block device. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
230
/* Character-device switch: raw-device entry points for raid(4). */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
245
/* Hooks handed to the generic disk (dk) framework for this driver. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
255
/* Per-unit software state for one raid(4) device. */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk framework state */
	int     sc_unit;		/* raid unit number */
	int     sc_flags;		/* flags */
	int     sc_cflags;		/* configuration flags */
	kmutex_t sc_mutex;		/* interlock mutex */
	kcondvar_t sc_cv;		/* and the condvar */
	uint64_t sc_size;		/* size of the raid device */
	char    sc_xname[20];		/* XXX external name */
	RF_Raid_t sc_r;			/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global "raids" list */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN	0x02	/* unit is being shutdown */
#define RAIDF_DETACH	0x04	/* detach after final close */
#define RAIDF_WANTED	0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED	0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

#define	raidunit(x)	DISKUNIT(x)
/* Map an autoconf device_t back to its raid_softc. */
#define raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
278
279 extern struct cfdriver raid_cd;
280 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
281 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
282 DVF_DETACH_SHUTDOWN);
283
284 /*
285 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
286 * Be aware that large numbers can allow the driver to consume a lot of
287 * kernel memory, especially on writes, and in degraded mode reads.
288 *
289 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
290 * a single 64K write will typically require 64K for the old data,
291 * 64K for the old parity, and 64K for the new parity, for a total
292 * of 192K (if the parity buffer is not re-used immediately).
293 * Even it if is used immediately, that's still 128K, which when multiplied
294 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
295 *
296 * Now in degraded mode, for example, a 64K read on the above setup may
297 * require data reconstruction, which will require *all* of the 4 remaining
298 * disks to participate -- 4 * 32K/disk == 128K again.
299 */
300
301 #ifndef RAIDOUTSTANDING
302 #define RAIDOUTSTANDING 6
303 #endif
304
305 #define RAIDLABELDEV(dev) \
306 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
307
308 /* declared here, and made public, for the benefit of KVM stuff.. */
309
310 static int raidlock(struct raid_softc *);
311 static void raidunlock(struct raid_softc *);
312
313 static int raid_detach_unlocked(struct raid_softc *);
314
315 static void rf_markalldirty(RF_Raid_t *);
316 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
317
318 void rf_ReconThread(struct rf_recon_req *);
319 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
320 void rf_CopybackThread(RF_Raid_t *raidPtr);
321 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
322 int rf_autoconfig(device_t);
323 void rf_buildroothack(RF_ConfigSet_t *);
324
325 RF_AutoConfig_t *rf_find_raid_components(void);
326 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
327 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
328 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
329 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
330 int rf_set_autoconfig(RF_Raid_t *, int);
331 int rf_set_rootpartition(RF_Raid_t *, int);
332 void rf_release_all_vps(RF_ConfigSet_t *);
333 void rf_cleanup_config_set(RF_ConfigSet_t *);
334 int rf_have_enough_components(RF_ConfigSet_t *);
335 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
336 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
337
338 /*
339 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
340 * Note that this is overridden by having RAID_AUTOCONFIG as an option
341 * in the kernel config file.
342 */
343 #ifdef RAID_AUTOCONFIG
344 int raidautoconfig = 1;
345 #else
346 int raidautoconfig = 0;
347 #endif
348 static bool raidautoconfigdone = false;
349
350 struct RF_Pools_s rf_pools;
351
352 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
353 static kmutex_t raid_lock;
354
355 static struct raid_softc *
356 raidcreate(int unit) {
357 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
358 if (sc == NULL) {
359 #ifdef DIAGNOSTIC
360 printf("%s: out of memory\n", __func__);
361 #endif
362 return NULL;
363 }
364 sc->sc_unit = unit;
365 cv_init(&sc->sc_cv, "raidunit");
366 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
367 return sc;
368 }
369
370 static void
371 raiddestroy(struct raid_softc *sc) {
372 cv_destroy(&sc->sc_cv);
373 mutex_destroy(&sc->sc_mutex);
374 kmem_free(sc, sizeof(*sc));
375 }
376
377 static struct raid_softc *
378 raidget(int unit, bool create) {
379 struct raid_softc *sc;
380 if (unit < 0) {
381 #ifdef DIAGNOSTIC
382 panic("%s: unit %d!", __func__, unit);
383 #endif
384 return NULL;
385 }
386 mutex_enter(&raid_lock);
387 LIST_FOREACH(sc, &raids, sc_link) {
388 if (sc->sc_unit == unit) {
389 mutex_exit(&raid_lock);
390 return sc;
391 }
392 }
393 mutex_exit(&raid_lock);
394 if (!create)
395 return NULL;
396 if ((sc = raidcreate(unit)) == NULL)
397 return NULL;
398 mutex_enter(&raid_lock);
399 LIST_INSERT_HEAD(&raids, sc, sc_link);
400 mutex_exit(&raid_lock);
401 return sc;
402 }
403
404 static void
405 raidput(struct raid_softc *sc) {
406 mutex_enter(&raid_lock);
407 LIST_REMOVE(sc, sc_link);
408 mutex_exit(&raid_lock);
409 raiddestroy(sc);
410 }
411
/*
 * Legacy pseudo-device attach hook; intentionally a no-op.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
421
422 int
423 rf_autoconfig(device_t self)
424 {
425 RF_AutoConfig_t *ac_list;
426 RF_ConfigSet_t *config_sets;
427
428 if (!raidautoconfig || raidautoconfigdone == true)
429 return (0);
430
431 /* XXX This code can only be run once. */
432 raidautoconfigdone = true;
433
434 #ifdef __HAVE_CPU_BOOTCONF
435 /*
436 * 0. find the boot device if needed first so we can use it later
437 * this needs to be done before we autoconfigure any raid sets,
438 * because if we use wedges we are not going to be able to open
439 * the boot device later
440 */
441 if (booted_device == NULL)
442 cpu_bootconf();
443 #endif
444 /* 1. locate all RAID components on the system */
445 aprint_debug("Searching for RAID components...\n");
446 ac_list = rf_find_raid_components();
447
448 /* 2. Sort them into their respective sets. */
449 config_sets = rf_create_auto_sets(ac_list);
450
451 /*
452 * 3. Evaluate each set and configure the valid ones.
453 * This gets done in rf_buildroothack().
454 */
455 rf_buildroothack(config_sets);
456
457 return 1;
458 }
459
460 static int
461 rf_containsboot(RF_Raid_t *r, device_t bdv) {
462 const char *bootname = device_xname(bdv);
463 size_t len = strlen(bootname);
464
465 for (int col = 0; col < r->numCol; col++) {
466 const char *devname = r->Disks[col].devname;
467 devname += sizeof("/dev/") - 1;
468 if (strncmp(devname, "dk", 2) == 0) {
469 const char *parent =
470 dkwedge_get_parent_name(r->Disks[col].dev);
471 if (parent != NULL)
472 devname = parent;
473 }
474 if (strncmp(devname, bootname, len) == 0) {
475 struct raid_softc *sc = r->softc;
476 aprint_debug("raid%d includes boot device %s\n",
477 sc->sc_unit, devname);
478 return 1;
479 }
480 }
481 return 0;
482 }
483
/*
 * Walk the list of autoconfigured sets, configure the eligible ones,
 * and — unless the user hardwired a root (rootspec) — try to point
 * booted_device at a configured RAID set that is rootable or contains
 * the boot device.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;			/* count of rootable sets configured */
	struct raid_softc *sc, *rsc;	/* rsc: last rootable set seen */
	struct dk_softc *dksc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/* Pass 1: configure every set that qualifies for autoconfig. */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}
	/* NOTE(review): rsc may be NULL here when no rootable set was
	   configured; taking &rsc->sc_dksc is then invalid pointer
	   arithmetic, though dksc is only dereferenced on paths where
	   num_root > 0 (which implies rsc != NULL).  Confirm/clean up. */
	dksc = &rsc->sc_dksc;

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
			"contains_boot=%d\n", __func__, booted_device,
			rsc->sc_r.root_partition,
			rf_containsboot(&rsc->sc_r, booted_device));
		/* Override root when nothing was booted, the label forces
		   root, or the set contains the boot device. */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Pass 2: disambiguate by finding the one configured set
		   that actually contains the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
619
620 static int
621 raidsize(dev_t dev)
622 {
623 struct raid_softc *rs;
624 struct dk_softc *dksc;
625 unsigned int unit;
626
627 unit = raidunit(dev);
628 if ((rs = raidget(unit, false)) == NULL)
629 return -1;
630 dksc = &rs->sc_dksc;
631
632 if ((rs->sc_flags & RAIDF_INITED) == 0)
633 return -1;
634
635 return dk_size(dksc, dev);
636 }
637
638 static int
639 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
640 {
641 unsigned int unit;
642 struct raid_softc *rs;
643 struct dk_softc *dksc;
644
645 unit = raidunit(dev);
646 if ((rs = raidget(unit, false)) == NULL)
647 return ENXIO;
648 dksc = &rs->sc_dksc;
649
650 if ((rs->sc_flags & RAIDF_INITED) == 0)
651 return ENODEV;
652
653 /*
654 Note that blkno is relative to this particular partition.
655 By adding adding RF_PROTECTED_SECTORS, we get a value that
656 is relative to the partition used for the underlying component.
657 */
658 blkno += RF_PROTECTED_SECTORS;
659
660 return dk_dump(dksc, dev, blkno, va, size);
661 }
662
/*
 * Write nblk blocks at blkno directly to one live component of a
 * RAID 1 set for crash dumps.  Only 1+1 (RAID 1) layouts are
 * supported; returns EINVAL otherwise or when no live component can
 * be found, ENXIO when the chosen component has no bdevsw entry.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	 */

	/* dumpto == -1 means "nothing found yet". */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			/* scol: which real column this spare replaces. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Dump straight through the component's own dump routine. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
768
769 /* ARGSUSED */
770 static int
771 raidopen(dev_t dev, int flags, int fmt,
772 struct lwp *l)
773 {
774 int unit = raidunit(dev);
775 struct raid_softc *rs;
776 struct dk_softc *dksc;
777 int error = 0;
778 int part, pmask;
779
780 if ((rs = raidget(unit, true)) == NULL)
781 return ENXIO;
782 if ((error = raidlock(rs)) != 0)
783 return (error);
784
785 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
786 error = EBUSY;
787 goto bad;
788 }
789
790 dksc = &rs->sc_dksc;
791
792 part = DISKPART(dev);
793 pmask = (1 << part);
794
795 if (!DK_BUSY(dksc, pmask) &&
796 ((rs->sc_flags & RAIDF_INITED) != 0)) {
797 /* First one... mark things as dirty... Note that we *MUST*
798 have done a configure before this. I DO NOT WANT TO BE
799 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
800 THAT THEY BELONG TOGETHER!!!!! */
801 /* XXX should check to see if we're only open for reading
802 here... If so, we needn't do this, but then need some
803 other way of keeping track of what's happened.. */
804
805 rf_markalldirty(&rs->sc_r);
806 }
807
808 if ((rs->sc_flags & RAIDF_INITED) != 0)
809 error = dk_open(dksc, dev, flags, fmt, l);
810
811 bad:
812 raidunlock(rs);
813
814 return (error);
815
816
817 }
818
819 static int
820 raid_lastclose(device_t self)
821 {
822 struct raid_softc *rs = raidsoftc(self);
823
824 /* Last one... device is not unconfigured yet.
825 Device shutdown has taken care of setting the
826 clean bits if RAIDF_INITED is not set
827 mark things as clean... */
828
829 rf_update_component_labels(&rs->sc_r,
830 RF_FINAL_COMPONENT_UPDATE);
831
832 /* pass to unlocked code */
833 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
834 rs->sc_flags |= RAIDF_DETACH;
835
836 return 0;
837 }
838
839 /* ARGSUSED */
840 static int
841 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
842 {
843 int unit = raidunit(dev);
844 struct raid_softc *rs;
845 struct dk_softc *dksc;
846 cfdata_t cf;
847 int error = 0, do_detach = 0, do_put = 0;
848
849 if ((rs = raidget(unit, false)) == NULL)
850 return ENXIO;
851 dksc = &rs->sc_dksc;
852
853 if ((error = raidlock(rs)) != 0)
854 return (error);
855
856 if ((rs->sc_flags & RAIDF_INITED) != 0) {
857 error = dk_close(dksc, dev, flags, fmt, l);
858 if ((rs->sc_flags & RAIDF_DETACH) != 0)
859 do_detach = 1;
860 } else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
861 do_put = 1;
862
863 raidunlock(rs);
864
865 if (do_detach) {
866 /* free the pseudo device attach bits */
867 cf = device_cfdata(dksc->sc_dev);
868 error = config_detach(dksc->sc_dev, 0);
869 if (error == 0)
870 free(cf, M_RAIDFRAME);
871 } else if (do_put) {
872 raidput(rs);
873 }
874
875 return (error);
876
877 }
878
879 static void
880 raid_wakeup(RF_Raid_t *raidPtr)
881 {
882 rf_lock_mutex2(raidPtr->iodone_lock);
883 rf_signal_cond2(raidPtr->iodone_cv);
884 rf_unlock_mutex2(raidPtr->iodone_lock);
885 }
886
887 static void
888 raidstrategy(struct buf *bp)
889 {
890 unsigned int unit;
891 struct raid_softc *rs;
892 struct dk_softc *dksc;
893 RF_Raid_t *raidPtr;
894
895 unit = raidunit(bp->b_dev);
896 if ((rs = raidget(unit, false)) == NULL) {
897 bp->b_error = ENXIO;
898 goto fail;
899 }
900 if ((rs->sc_flags & RAIDF_INITED) == 0) {
901 bp->b_error = ENXIO;
902 goto fail;
903 }
904 dksc = &rs->sc_dksc;
905 raidPtr = &rs->sc_r;
906
907 /* Queue IO only */
908 if (dk_strategy_defer(dksc, bp))
909 goto done;
910
911 /* schedule the IO to happen at the next convenient time */
912 raid_wakeup(raidPtr);
913
914 done:
915 return;
916
917 fail:
918 bp->b_resid = bp->b_bcount;
919 biodone(bp);
920 }
921
922 static int
923 raid_diskstart(device_t dev, struct buf *bp)
924 {
925 struct raid_softc *rs = raidsoftc(dev);
926 RF_Raid_t *raidPtr;
927
928 raidPtr = &rs->sc_r;
929 if (!raidPtr->valid) {
930 db1_printf(("raid is not valid..\n"));
931 return ENODEV;
932 }
933
934 /* XXX */
935 bp->b_resid = 0;
936
937 return raiddoaccess(raidPtr, bp);
938 }
939
940 void
941 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
942 {
943 struct raid_softc *rs;
944 struct dk_softc *dksc;
945
946 rs = raidPtr->softc;
947 dksc = &rs->sc_dksc;
948
949 dk_done(dksc, bp);
950
951 rf_lock_mutex2(raidPtr->mutex);
952 raidPtr->openings++;
953 rf_unlock_mutex2(raidPtr->mutex);
954
955 /* schedule more IO */
956 raid_wakeup(raidPtr);
957 }
958
959 /* ARGSUSED */
960 static int
961 raidread(dev_t dev, struct uio *uio, int flags)
962 {
963 int unit = raidunit(dev);
964 struct raid_softc *rs;
965
966 if ((rs = raidget(unit, false)) == NULL)
967 return ENXIO;
968
969 if ((rs->sc_flags & RAIDF_INITED) == 0)
970 return (ENXIO);
971
972 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
973
974 }
975
976 /* ARGSUSED */
977 static int
978 raidwrite(dev_t dev, struct uio *uio, int flags)
979 {
980 int unit = raidunit(dev);
981 struct raid_softc *rs;
982
983 if ((rs = raidget(unit, false)) == NULL)
984 return ENXIO;
985
986 if ((rs->sc_flags & RAIDF_INITED) == 0)
987 return (ENXIO);
988
989 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
990
991 }
992
/*
 * Tear down a configured unit (caller holds the unit lock): shut down
 * RAIDframe, drain queued I/O, and detach the disk from the system.
 * Returns EBUSY while the unit is open or a background operation
 * (reconstruction, parity rewrite, copyback) is in progress.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to undo on an unconfigured unit. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
1030
1031 static int
1032 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1033 {
1034 int unit = raidunit(dev);
1035 int error = 0;
1036 int part, pmask;
1037 struct raid_softc *rs;
1038 struct dk_softc *dksc;
1039 RF_Config_t *k_cfg, *u_cfg;
1040 RF_Raid_t *raidPtr;
1041 RF_RaidDisk_t *diskPtr;
1042 RF_AccTotals_t *totals;
1043 RF_DeviceConfig_t *d_cfg, **ucfgp;
1044 u_char *specific_buf;
1045 int retcode = 0;
1046 int column;
1047 /* int raidid; */
1048 struct rf_recon_req *rrcopy, *rr;
1049 RF_ComponentLabel_t *clabel;
1050 RF_ComponentLabel_t *ci_label;
1051 RF_ComponentLabel_t **clabel_ptr;
1052 RF_SingleComponent_t *sparePtr,*componentPtr;
1053 RF_SingleComponent_t component;
1054 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1055 int i, j, d;
1056
1057 if ((rs = raidget(unit, false)) == NULL)
1058 return ENXIO;
1059 dksc = &rs->sc_dksc;
1060 raidPtr = &rs->sc_r;
1061
1062 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1063 (int) DISKPART(dev), (int) unit, cmd));
1064
1065 /* Must be initialized for these... */
1066 switch (cmd) {
1067 case RAIDFRAME_REWRITEPARITY:
1068 case RAIDFRAME_GET_INFO:
1069 case RAIDFRAME_RESET_ACCTOTALS:
1070 case RAIDFRAME_GET_ACCTOTALS:
1071 case RAIDFRAME_KEEP_ACCTOTALS:
1072 case RAIDFRAME_GET_SIZE:
1073 case RAIDFRAME_FAIL_DISK:
1074 case RAIDFRAME_COPYBACK:
1075 case RAIDFRAME_CHECK_RECON_STATUS:
1076 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1077 case RAIDFRAME_GET_COMPONENT_LABEL:
1078 case RAIDFRAME_SET_COMPONENT_LABEL:
1079 case RAIDFRAME_ADD_HOT_SPARE:
1080 case RAIDFRAME_REMOVE_HOT_SPARE:
1081 case RAIDFRAME_INIT_LABELS:
1082 case RAIDFRAME_REBUILD_IN_PLACE:
1083 case RAIDFRAME_CHECK_PARITY:
1084 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1085 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1086 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1087 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1088 case RAIDFRAME_SET_AUTOCONFIG:
1089 case RAIDFRAME_SET_ROOT:
1090 case RAIDFRAME_DELETE_COMPONENT:
1091 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1092 case RAIDFRAME_PARITYMAP_STATUS:
1093 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1094 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1095 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1096 if ((rs->sc_flags & RAIDF_INITED) == 0)
1097 return (ENXIO);
1098 }
1099
1100 switch (cmd) {
1101 #ifdef COMPAT_50
1102 case RAIDFRAME_GET_INFO50:
1103 return rf_get_info50(raidPtr, data);
1104
1105 case RAIDFRAME_CONFIGURE50:
1106 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1107 return retcode;
1108 goto config;
1109 #endif
1110 /* configure the system */
1111 case RAIDFRAME_CONFIGURE:
1112
1113 if (raidPtr->valid) {
1114 /* There is a valid RAID set running on this unit! */
1115 printf("raid%d: Device already configured!\n",unit);
1116 return(EINVAL);
1117 }
1118
1119 /* copy-in the configuration information */
1120 /* data points to a pointer to the configuration structure */
1121
1122 u_cfg = *((RF_Config_t **) data);
1123 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1124 if (k_cfg == NULL) {
1125 return (ENOMEM);
1126 }
1127 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1128 if (retcode) {
1129 RF_Free(k_cfg, sizeof(RF_Config_t));
1130 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1131 retcode));
1132 goto no_config;
1133 }
1134 goto config;
1135 config:
1136 rs->sc_flags &= ~RAIDF_SHUTDOWN;
1137
1138 /* allocate a buffer for the layout-specific data, and copy it
1139 * in */
1140 if (k_cfg->layoutSpecificSize) {
1141 if (k_cfg->layoutSpecificSize > 10000) {
1142 /* sanity check */
1143 RF_Free(k_cfg, sizeof(RF_Config_t));
1144 retcode = EINVAL;
1145 goto no_config;
1146 }
1147 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1148 (u_char *));
1149 if (specific_buf == NULL) {
1150 RF_Free(k_cfg, sizeof(RF_Config_t));
1151 retcode = ENOMEM;
1152 goto no_config;
1153 }
1154 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1155 k_cfg->layoutSpecificSize);
1156 if (retcode) {
1157 RF_Free(k_cfg, sizeof(RF_Config_t));
1158 RF_Free(specific_buf,
1159 k_cfg->layoutSpecificSize);
1160 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1161 retcode));
1162 goto no_config;
1163 }
1164 } else
1165 specific_buf = NULL;
1166 k_cfg->layoutSpecific = specific_buf;
1167
1168 /* should do some kind of sanity check on the configuration.
1169 * Store the sum of all the bytes in the last byte? */
1170
1171 /* configure the system */
1172
1173 /*
1174 * Clear the entire RAID descriptor, just to make sure
1175 * there is no stale data left in the case of a
1176 * reconfiguration
1177 */
1178 memset(raidPtr, 0, sizeof(*raidPtr));
1179 raidPtr->softc = rs;
1180 raidPtr->raidid = unit;
1181
1182 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1183
1184 if (retcode == 0) {
1185
1186 /* allow this many simultaneous IO's to
1187 this RAID device */
1188 raidPtr->openings = RAIDOUTSTANDING;
1189
1190 raidinit(rs);
1191 raid_wakeup(raidPtr);
1192 rf_markalldirty(raidPtr);
1193 }
1194 /* free the buffers. No return code here. */
1195 if (k_cfg->layoutSpecificSize) {
1196 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1197 }
1198 RF_Free(k_cfg, sizeof(RF_Config_t));
1199
1200 no_config:
1201 /*
1202 * If configuration failed, set sc_flags so that we
1203 * will detach the device when we close it.
1204 */
1205 if (retcode != 0)
1206 rs->sc_flags |= RAIDF_SHUTDOWN;
1207 return (retcode);
1208
1209 /* shutdown the system */
1210 case RAIDFRAME_SHUTDOWN:
1211
1212 part = DISKPART(dev);
1213 pmask = (1 << part);
1214
1215 if ((error = raidlock(rs)) != 0)
1216 return (error);
1217
1218 if (DK_BUSY(dksc, pmask) ||
1219 raidPtr->recon_in_progress != 0 ||
1220 raidPtr->parity_rewrite_in_progress != 0 ||
1221 raidPtr->copyback_in_progress != 0)
1222 retcode = EBUSY;
1223 else {
1224 /* detach and free on close */
1225 rs->sc_flags |= RAIDF_SHUTDOWN;
1226 retcode = 0;
1227 }
1228
1229 raidunlock(rs);
1230
1231 return (retcode);
1232 case RAIDFRAME_GET_COMPONENT_LABEL:
1233 clabel_ptr = (RF_ComponentLabel_t **) data;
1234 /* need to read the component label for the disk indicated
1235 by row,column in clabel */
1236
1237 /*
1238 * Perhaps there should be an option to skip the in-core
1239 * copy and hit the disk, as with disklabel(8).
1240 */
1241 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1242
1243 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1244
1245 if (retcode) {
1246 RF_Free(clabel, sizeof(*clabel));
1247 return retcode;
1248 }
1249
1250 clabel->row = 0; /* Don't allow looking at anything else.*/
1251
1252 column = clabel->column;
1253
1254 if ((column < 0) || (column >= raidPtr->numCol +
1255 raidPtr->numSpare)) {
1256 RF_Free(clabel, sizeof(*clabel));
1257 return EINVAL;
1258 }
1259
1260 RF_Free(clabel, sizeof(*clabel));
1261
1262 clabel = raidget_component_label(raidPtr, column);
1263
1264 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1265
1266 #if 0
1267 case RAIDFRAME_SET_COMPONENT_LABEL:
1268 clabel = (RF_ComponentLabel_t *) data;
1269
1270 /* XXX check the label for valid stuff... */
1271 /* Note that some things *should not* get modified --
1272 the user should be re-initing the labels instead of
1273 trying to patch things.
1274 */
1275
1276 raidid = raidPtr->raidid;
1277 #ifdef DEBUG
1278 printf("raid%d: Got component label:\n", raidid);
1279 printf("raid%d: Version: %d\n", raidid, clabel->version);
1280 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1281 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1282 printf("raid%d: Column: %d\n", raidid, clabel->column);
1283 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1284 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1285 printf("raid%d: Status: %d\n", raidid, clabel->status);
1286 #endif
1287 clabel->row = 0;
1288 column = clabel->column;
1289
1290 if ((column < 0) || (column >= raidPtr->numCol)) {
1291 return(EINVAL);
1292 }
1293
1294 /* XXX this isn't allowed to do anything for now :-) */
1295
1296 /* XXX and before it is, we need to fill in the rest
1297 of the fields!?!?!?! */
1298 memcpy(raidget_component_label(raidPtr, column),
1299 clabel, sizeof(*clabel));
1300 raidflush_component_label(raidPtr, column);
1301 return (0);
1302 #endif
1303
1304 case RAIDFRAME_INIT_LABELS:
1305 clabel = (RF_ComponentLabel_t *) data;
1306 /*
1307 we only want the serial number from
1308 the above. We get all the rest of the information
1309 from the config that was used to create this RAID
1310 set.
1311 */
1312
1313 raidPtr->serial_number = clabel->serial_number;
1314
1315 for(column=0;column<raidPtr->numCol;column++) {
1316 diskPtr = &raidPtr->Disks[column];
1317 if (!RF_DEAD_DISK(diskPtr->status)) {
1318 ci_label = raidget_component_label(raidPtr,
1319 column);
1320 /* Zeroing this is important. */
1321 memset(ci_label, 0, sizeof(*ci_label));
1322 raid_init_component_label(raidPtr, ci_label);
1323 ci_label->serial_number =
1324 raidPtr->serial_number;
1325 ci_label->row = 0; /* we dont' pretend to support more */
1326 rf_component_label_set_partitionsize(ci_label,
1327 diskPtr->partitionSize);
1328 ci_label->column = column;
1329 raidflush_component_label(raidPtr, column);
1330 }
1331 /* XXXjld what about the spares? */
1332 }
1333
1334 return (retcode);
1335 case RAIDFRAME_SET_AUTOCONFIG:
1336 d = rf_set_autoconfig(raidPtr, *(int *) data);
1337 printf("raid%d: New autoconfig value is: %d\n",
1338 raidPtr->raidid, d);
1339 *(int *) data = d;
1340 return (retcode);
1341
1342 case RAIDFRAME_SET_ROOT:
1343 d = rf_set_rootpartition(raidPtr, *(int *) data);
1344 printf("raid%d: New rootpartition value is: %d\n",
1345 raidPtr->raidid, d);
1346 *(int *) data = d;
1347 return (retcode);
1348
1349 /* initialize all parity */
1350 case RAIDFRAME_REWRITEPARITY:
1351
1352 if (raidPtr->Layout.map->faultsTolerated == 0) {
1353 /* Parity for RAID 0 is trivially correct */
1354 raidPtr->parity_good = RF_RAID_CLEAN;
1355 return(0);
1356 }
1357
1358 if (raidPtr->parity_rewrite_in_progress == 1) {
1359 /* Re-write is already in progress! */
1360 return(EINVAL);
1361 }
1362
1363 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1364 rf_RewriteParityThread,
1365 raidPtr,"raid_parity");
1366 return (retcode);
1367
1368
1369 case RAIDFRAME_ADD_HOT_SPARE:
1370 sparePtr = (RF_SingleComponent_t *) data;
1371 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1372 retcode = rf_add_hot_spare(raidPtr, &component);
1373 return(retcode);
1374
1375 case RAIDFRAME_REMOVE_HOT_SPARE:
1376 return(retcode);
1377
1378 case RAIDFRAME_DELETE_COMPONENT:
1379 componentPtr = (RF_SingleComponent_t *)data;
1380 memcpy( &component, componentPtr,
1381 sizeof(RF_SingleComponent_t));
1382 retcode = rf_delete_component(raidPtr, &component);
1383 return(retcode);
1384
1385 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1386 componentPtr = (RF_SingleComponent_t *)data;
1387 memcpy( &component, componentPtr,
1388 sizeof(RF_SingleComponent_t));
1389 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1390 return(retcode);
1391
1392 case RAIDFRAME_REBUILD_IN_PLACE:
1393
1394 if (raidPtr->Layout.map->faultsTolerated == 0) {
1395 /* Can't do this on a RAID 0!! */
1396 return(EINVAL);
1397 }
1398
1399 if (raidPtr->recon_in_progress == 1) {
1400 /* a reconstruct is already in progress! */
1401 return(EINVAL);
1402 }
1403
1404 componentPtr = (RF_SingleComponent_t *) data;
1405 memcpy( &component, componentPtr,
1406 sizeof(RF_SingleComponent_t));
1407 component.row = 0; /* we don't support any more */
1408 column = component.column;
1409
1410 if ((column < 0) || (column >= raidPtr->numCol)) {
1411 return(EINVAL);
1412 }
1413
1414 rf_lock_mutex2(raidPtr->mutex);
1415 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1416 (raidPtr->numFailures > 0)) {
1417 /* XXX 0 above shouldn't be constant!!! */
1418 /* some component other than this has failed.
1419 Let's not make things worse than they already
1420 are... */
1421 printf("raid%d: Unable to reconstruct to disk at:\n",
1422 raidPtr->raidid);
1423 printf("raid%d: Col: %d Too many failures.\n",
1424 raidPtr->raidid, column);
1425 rf_unlock_mutex2(raidPtr->mutex);
1426 return (EINVAL);
1427 }
1428 if (raidPtr->Disks[column].status ==
1429 rf_ds_reconstructing) {
1430 printf("raid%d: Unable to reconstruct to disk at:\n",
1431 raidPtr->raidid);
1432 printf("raid%d: Col: %d Reconstruction already occurring!\n", raidPtr->raidid, column);
1433
1434 rf_unlock_mutex2(raidPtr->mutex);
1435 return (EINVAL);
1436 }
1437 if (raidPtr->Disks[column].status == rf_ds_spared) {
1438 rf_unlock_mutex2(raidPtr->mutex);
1439 return (EINVAL);
1440 }
1441 rf_unlock_mutex2(raidPtr->mutex);
1442
1443 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1444 if (rrcopy == NULL)
1445 return(ENOMEM);
1446
1447 rrcopy->raidPtr = (void *) raidPtr;
1448 rrcopy->col = column;
1449
1450 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1451 rf_ReconstructInPlaceThread,
1452 rrcopy,"raid_reconip");
1453 return(retcode);
1454
1455 case RAIDFRAME_GET_INFO:
1456 if (!raidPtr->valid)
1457 return (ENODEV);
1458 ucfgp = (RF_DeviceConfig_t **) data;
1459 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1460 (RF_DeviceConfig_t *));
1461 if (d_cfg == NULL)
1462 return (ENOMEM);
1463 d_cfg->rows = 1; /* there is only 1 row now */
1464 d_cfg->cols = raidPtr->numCol;
1465 d_cfg->ndevs = raidPtr->numCol;
1466 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1467 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1468 return (ENOMEM);
1469 }
1470 d_cfg->nspares = raidPtr->numSpare;
1471 if (d_cfg->nspares >= RF_MAX_DISKS) {
1472 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1473 return (ENOMEM);
1474 }
1475 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1476 d = 0;
1477 for (j = 0; j < d_cfg->cols; j++) {
1478 d_cfg->devs[d] = raidPtr->Disks[j];
1479 d++;
1480 }
1481 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1482 d_cfg->spares[i] = raidPtr->Disks[j];
1483 if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
1484 /* XXX: raidctl(8) expects to see this as a used spare */
1485 d_cfg->spares[i].status = rf_ds_used_spare;
1486 }
1487 }
1488 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1489 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1490
1491 return (retcode);
1492
1493 case RAIDFRAME_CHECK_PARITY:
1494 *(int *) data = raidPtr->parity_good;
1495 return (0);
1496
1497 case RAIDFRAME_PARITYMAP_STATUS:
1498 if (rf_paritymap_ineligible(raidPtr))
1499 return EINVAL;
1500 rf_paritymap_status(raidPtr->parity_map,
1501 (struct rf_pmstat *)data);
1502 return 0;
1503
1504 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1505 if (rf_paritymap_ineligible(raidPtr))
1506 return EINVAL;
1507 if (raidPtr->parity_map == NULL)
1508 return ENOENT; /* ??? */
1509 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1510 (struct rf_pmparams *)data, 1))
1511 return EINVAL;
1512 return 0;
1513
1514 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1515 if (rf_paritymap_ineligible(raidPtr))
1516 return EINVAL;
1517 *(int *) data = rf_paritymap_get_disable(raidPtr);
1518 return 0;
1519
1520 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1521 if (rf_paritymap_ineligible(raidPtr))
1522 return EINVAL;
1523 rf_paritymap_set_disable(raidPtr, *(int *)data);
1524 /* XXX should errors be passed up? */
1525 return 0;
1526
1527 case RAIDFRAME_RESET_ACCTOTALS:
1528 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1529 return (0);
1530
1531 case RAIDFRAME_GET_ACCTOTALS:
1532 totals = (RF_AccTotals_t *) data;
1533 *totals = raidPtr->acc_totals;
1534 return (0);
1535
1536 case RAIDFRAME_KEEP_ACCTOTALS:
1537 raidPtr->keep_acc_totals = *(int *)data;
1538 return (0);
1539
1540 case RAIDFRAME_GET_SIZE:
1541 *(int *) data = raidPtr->totalSectors;
1542 return (0);
1543
1544 /* fail a disk & optionally start reconstruction */
1545 case RAIDFRAME_FAIL_DISK:
1546
1547 if (raidPtr->Layout.map->faultsTolerated == 0) {
1548 /* Can't do this on a RAID 0!! */
1549 return(EINVAL);
1550 }
1551
1552 rr = (struct rf_recon_req *) data;
1553 rr->row = 0;
1554 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1555 return (EINVAL);
1556
1557
1558 rf_lock_mutex2(raidPtr->mutex);
1559 if (raidPtr->status == rf_rs_reconstructing) {
1560 /* you can't fail a disk while we're reconstructing! */
1561 /* XXX wrong for RAID6 */
1562 rf_unlock_mutex2(raidPtr->mutex);
1563 return (EINVAL);
1564 }
1565 if ((raidPtr->Disks[rr->col].status ==
1566 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1567 /* some other component has failed. Let's not make
1568 things worse. XXX wrong for RAID6 */
1569 rf_unlock_mutex2(raidPtr->mutex);
1570 return (EINVAL);
1571 }
1572 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1573 /* Can't fail a spared disk! */
1574 rf_unlock_mutex2(raidPtr->mutex);
1575 return (EINVAL);
1576 }
1577 rf_unlock_mutex2(raidPtr->mutex);
1578
1579 /* make a copy of the recon request so that we don't rely on
1580 * the user's buffer */
1581 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1582 if (rrcopy == NULL)
1583 return(ENOMEM);
1584 memcpy(rrcopy, rr, sizeof(*rr));
1585 rrcopy->raidPtr = (void *) raidPtr;
1586
1587 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1588 rf_ReconThread,
1589 rrcopy,"raid_recon");
1590 return (0);
1591
1592 /* invoke a copyback operation after recon on whatever disk
1593 * needs it, if any */
1594 case RAIDFRAME_COPYBACK:
1595
1596 if (raidPtr->Layout.map->faultsTolerated == 0) {
1597 /* This makes no sense on a RAID 0!! */
1598 return(EINVAL);
1599 }
1600
1601 if (raidPtr->copyback_in_progress == 1) {
1602 /* Copyback is already in progress! */
1603 return(EINVAL);
1604 }
1605
1606 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1607 rf_CopybackThread,
1608 raidPtr,"raid_copyback");
1609 return (retcode);
1610
1611 /* return the percentage completion of reconstruction */
1612 case RAIDFRAME_CHECK_RECON_STATUS:
1613 if (raidPtr->Layout.map->faultsTolerated == 0) {
1614 /* This makes no sense on a RAID 0, so tell the
1615 user it's done. */
1616 *(int *) data = 100;
1617 return(0);
1618 }
1619 if (raidPtr->status != rf_rs_reconstructing)
1620 *(int *) data = 100;
1621 else {
1622 if (raidPtr->reconControl->numRUsTotal > 0) {
1623 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1624 } else {
1625 *(int *) data = 0;
1626 }
1627 }
1628 return (0);
1629 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1630 progressInfoPtr = (RF_ProgressInfo_t **) data;
1631 if (raidPtr->status != rf_rs_reconstructing) {
1632 progressInfo.remaining = 0;
1633 progressInfo.completed = 100;
1634 progressInfo.total = 100;
1635 } else {
1636 progressInfo.total =
1637 raidPtr->reconControl->numRUsTotal;
1638 progressInfo.completed =
1639 raidPtr->reconControl->numRUsComplete;
1640 progressInfo.remaining = progressInfo.total -
1641 progressInfo.completed;
1642 }
1643 retcode = copyout(&progressInfo, *progressInfoPtr,
1644 sizeof(RF_ProgressInfo_t));
1645 return (retcode);
1646
1647 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1648 if (raidPtr->Layout.map->faultsTolerated == 0) {
1649 /* This makes no sense on a RAID 0, so tell the
1650 user it's done. */
1651 *(int *) data = 100;
1652 return(0);
1653 }
1654 if (raidPtr->parity_rewrite_in_progress == 1) {
1655 *(int *) data = 100 *
1656 raidPtr->parity_rewrite_stripes_done /
1657 raidPtr->Layout.numStripe;
1658 } else {
1659 *(int *) data = 100;
1660 }
1661 return (0);
1662
1663 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1664 progressInfoPtr = (RF_ProgressInfo_t **) data;
1665 if (raidPtr->parity_rewrite_in_progress == 1) {
1666 progressInfo.total = raidPtr->Layout.numStripe;
1667 progressInfo.completed =
1668 raidPtr->parity_rewrite_stripes_done;
1669 progressInfo.remaining = progressInfo.total -
1670 progressInfo.completed;
1671 } else {
1672 progressInfo.remaining = 0;
1673 progressInfo.completed = 100;
1674 progressInfo.total = 100;
1675 }
1676 retcode = copyout(&progressInfo, *progressInfoPtr,
1677 sizeof(RF_ProgressInfo_t));
1678 return (retcode);
1679
1680 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1681 if (raidPtr->Layout.map->faultsTolerated == 0) {
1682 /* This makes no sense on a RAID 0 */
1683 *(int *) data = 100;
1684 return(0);
1685 }
1686 if (raidPtr->copyback_in_progress == 1) {
1687 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1688 raidPtr->Layout.numStripe;
1689 } else {
1690 *(int *) data = 100;
1691 }
1692 return (0);
1693
1694 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1695 progressInfoPtr = (RF_ProgressInfo_t **) data;
1696 if (raidPtr->copyback_in_progress == 1) {
1697 progressInfo.total = raidPtr->Layout.numStripe;
1698 progressInfo.completed =
1699 raidPtr->copyback_stripes_done;
1700 progressInfo.remaining = progressInfo.total -
1701 progressInfo.completed;
1702 } else {
1703 progressInfo.remaining = 0;
1704 progressInfo.completed = 100;
1705 progressInfo.total = 100;
1706 }
1707 retcode = copyout(&progressInfo, *progressInfoPtr,
1708 sizeof(RF_ProgressInfo_t));
1709 return (retcode);
1710
1711 case RAIDFRAME_SET_LAST_UNIT:
1712 for (column = 0; column < raidPtr->numCol; column++)
1713 if (raidPtr->Disks[column].status != rf_ds_optimal)
1714 return EBUSY;
1715
1716 for (column = 0; column < raidPtr->numCol; column++) {
1717 clabel = raidget_component_label(raidPtr, column);
1718 clabel->last_unit = *(int *)data;
1719 raidflush_component_label(raidPtr, column);
1720 }
1721 rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1722 return 0;
1723
1724 /* the sparetable daemon calls this to wait for the kernel to
1725 * need a spare table. this ioctl does not return until a
1726 * spare table is needed. XXX -- calling mpsleep here in the
1727 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1728 * -- I should either compute the spare table in the kernel,
1729 * or have a different -- XXX XXX -- interface (a different
1730 * character device) for delivering the table -- XXX */
1731 #if 0
1732 case RAIDFRAME_SPARET_WAIT:
1733 rf_lock_mutex2(rf_sparet_wait_mutex);
1734 while (!rf_sparet_wait_queue)
1735 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1736 waitreq = rf_sparet_wait_queue;
1737 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1738 rf_unlock_mutex2(rf_sparet_wait_mutex);
1739
1740 /* structure assignment */
1741 *((RF_SparetWait_t *) data) = *waitreq;
1742
1743 RF_Free(waitreq, sizeof(*waitreq));
1744 return (0);
1745
1746 /* wakes up a process waiting on SPARET_WAIT and puts an error
1747 * code in it that will cause the dameon to exit */
1748 case RAIDFRAME_ABORT_SPARET_WAIT:
1749 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1750 waitreq->fcol = -1;
1751 rf_lock_mutex2(rf_sparet_wait_mutex);
1752 waitreq->next = rf_sparet_wait_queue;
1753 rf_sparet_wait_queue = waitreq;
1754 rf_broadcast_conf2(rf_sparet_wait_cv);
1755 rf_unlock_mutex2(rf_sparet_wait_mutex);
1756 return (0);
1757
1758 /* used by the spare table daemon to deliver a spare table
1759 * into the kernel */
1760 case RAIDFRAME_SEND_SPARET:
1761
1762 /* install the spare table */
1763 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1764
1765 /* respond to the requestor. the return status of the spare
1766 * table installation is passed in the "fcol" field */
1767 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1768 waitreq->fcol = retcode;
1769 rf_lock_mutex2(rf_sparet_wait_mutex);
1770 waitreq->next = rf_sparet_resp_queue;
1771 rf_sparet_resp_queue = waitreq;
1772 rf_broadcast_cond2(rf_sparet_resp_cv);
1773 rf_unlock_mutex2(rf_sparet_wait_mutex);
1774
1775 return (retcode);
1776 #endif
1777
1778 default:
1779 break; /* fall through to the os-specific code below */
1780
1781 }
1782
1783 if (!raidPtr->valid)
1784 return (EINVAL);
1785
1786 /*
1787 * Add support for "regular" device ioctls here.
1788 */
1789
1790 switch (cmd) {
1791 case DIOCGCACHE:
1792 retcode = rf_get_component_caches(raidPtr, (int *)data);
1793 break;
1794
1795 case DIOCCACHESYNC:
1796 retcode = rf_sync_component_caches(raidPtr);
1797 break;
1798
1799 default:
1800 retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1801 break;
1802 }
1803
1804 return (retcode);
1805
1806 }
1807
1808
1809 /* raidinit -- complete the rest of the initialization for the
1810 RAIDframe device. */
1811
1812
/*
 * raidinit: complete the initialization of a configured RAIDframe unit.
 * Attaches the pseudo-device, wires up the dk(4)/disk(9) layers,
 * allocates the buffer queue, marks the unit usable (RAIDF_INITED),
 * and kicks off wedge discovery.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		/* NOTE(review): on this path RAIDF_INITED is never set and
		 * no error is reported to the caller -- confirm intended. */
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}
1868
1869 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1870 /* wake up the daemon & tell it to get us a spare table
1871 * XXX
1872 * the entries in the queues should be tagged with the raidPtr
1873 * so that in the extremely rare case that two recons happen at once,
 1874  * we know for which device we're requesting a spare table
1875 * XXX
1876 *
1877 * XXX This code is not currently used. GO
1878 */
/*
 * Post a spare-table request for the user-level daemon and block until
 * the daemon delivers a response (via RAIDFRAME_SEND_SPARET).  Returns
 * the status the daemon placed in the response's fcol field.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	/* Enqueue the request and wake the daemon sleeping on the
	 * wait queue. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* Both queues are guarded by rf_sparet_wait_mutex; wait here
	 * until a response appears on the response queue. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1902 #endif
1903
1904 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1905 * bp & passes it down.
1906 * any calls originating in the kernel must use non-blocking I/O
1907 * do some extra sanity checking to return "appropriate" error values for
1908 * certain conditions (to make some standard utilities work)
1909 *
1910 * Formerly known as: rf_DoAccessKernel
1911 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Deliberately drop the mutex across the component-label
		 * update, then retake it to decrement the counter. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
		    RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Refuse to start I/O on a unit that never finished raidinit(). */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Hand any queued buffers to the dk(4) layer for dispatch. */
	dk_start(dksc, NULL);
}
1938
1939 static int
1940 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
1941 {
1942 RF_SectorCount_t num_blocks, pb, sum;
1943 RF_RaidAddr_t raid_addr;
1944 daddr_t blocknum;
1945 int do_async;
1946 int rc;
1947
1948 rf_lock_mutex2(raidPtr->mutex);
1949 if (raidPtr->openings == 0) {
1950 rf_unlock_mutex2(raidPtr->mutex);
1951 return EAGAIN;
1952 }
1953 rf_unlock_mutex2(raidPtr->mutex);
1954
1955 blocknum = bp->b_rawblkno;
1956
1957 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1958 (int) blocknum));
1959
1960 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1961 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1962
1963 /* *THIS* is where we adjust what block we're going to...
1964 * but DO NOT TOUCH bp->b_blkno!!! */
1965 raid_addr = blocknum;
1966
1967 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1968 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1969 sum = raid_addr + num_blocks + pb;
1970 if (1 || rf_debugKernelAccess) {
1971 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1972 (int) raid_addr, (int) sum, (int) num_blocks,
1973 (int) pb, (int) bp->b_resid));
1974 }
1975 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1976 || (sum < num_blocks) || (sum < pb)) {
1977 rc = ENOSPC;
1978 goto done;
1979 }
1980 /*
1981 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1982 */
1983
1984 if (bp->b_bcount & raidPtr->sectorMask) {
1985 rc = ENOSPC;
1986 goto done;
1987 }
1988 db1_printf(("Calling DoAccess..\n"));
1989
1990
1991 rf_lock_mutex2(raidPtr->mutex);
1992 raidPtr->openings--;
1993 rf_unlock_mutex2(raidPtr->mutex);
1994
1995 /*
1996 * Everything is async.
1997 */
1998 do_async = 1;
1999
2000 /* don't ever condition on bp->b_flags & B_WRITE.
2001 * always condition on B_READ instead */
2002
2003 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
2004 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
2005 do_async, raid_addr, num_blocks,
2006 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2007
2008 done:
2009 return rc;
2010 }
2011
2012 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2013
/*
 * rf_DispatchKernelIO: issue one disk-queue request against a component.
 * READ/WRITE requests are packaged into the request's buf via InitBP()
 * and sent down with bdev_strategy(); completion arrives asynchronously
 * through KernelWakeupFunc().  NOP requests complete immediately.
 * Always returns 0.  The disk queue mutex must be held on entry.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP right away via the normal callback. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
2087 /* this is the callback function associated with a I/O invoked from
2088 kernel code.
2089 */
/*
 * KernelWakeupFunc: completion callback (b_iodone) for component I/O
 * issued by rf_DispatchKernelIO().  On error, fails the component --
 * but only if it is currently usable and failing it would not exceed
 * the layout's fault tolerance.  The request is then placed on the
 * raidPtr->iodone queue and the raidio thread is signalled.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The originating request was stashed in b_private by dispatch. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_optimal) ||
		    (queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_used_spare)) &&
		    (queue->raidPtr->numFailures <
		    queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			    queue->raidPtr->raidid,
			    bp->b_error,
			    queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update in
			 * raidstart(). */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2156
2157
2158 /*
2159 * initialize a buf structure for doing an I/O in the kernel.
2160 */
2161 static void
2162 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2163 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2164 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2165 struct proc *b_proc)
2166 {
2167 /* bp->b_flags = B_PHYS | rw_flag; */
2168 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2169 bp->b_oflags = 0;
2170 bp->b_cflags = 0;
2171 bp->b_bcount = numSect << logBytesPerSector;
2172 bp->b_bufsize = bp->b_bcount;
2173 bp->b_error = 0;
2174 bp->b_dev = dev;
2175 bp->b_data = bf;
2176 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2177 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2178 if (bp->b_bcount == 0) {
2179 panic("bp->b_bcount is zero in InitBP!!");
2180 }
2181 bp->b_proc = b_proc;
2182 bp->b_iodone = cbFunc;
2183 bp->b_private = cbArg;
2184 }
2185
2186 /*
2187 * Wait interruptibly for an exclusive lock.
2188 *
2189 * XXX
2190 * Several drivers do this; it should be abstracted and made MP-safe.
2191 * (Hmm... where have we seen this warning before :-> GO )
2192 */
2193 static int
2194 raidlock(struct raid_softc *rs)
2195 {
2196 int error;
2197
2198 error = 0;
2199 mutex_enter(&rs->sc_mutex);
2200 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2201 rs->sc_flags |= RAIDF_WANTED;
2202 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2203 if (error != 0)
2204 goto done;
2205 }
2206 rs->sc_flags |= RAIDF_LOCKED;
2207 done:
2208 mutex_exit(&rs->sc_mutex);
2209 return (error);
2210 }
2211 /*
2212 * Unlock and wake up any waiters.
2213 */
2214 static void
2215 raidunlock(struct raid_softc *rs)
2216 {
2217
2218 mutex_enter(&rs->sc_mutex);
2219 rs->sc_flags &= ~RAIDF_LOCKED;
2220 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2221 rs->sc_flags &= ~RAIDF_WANTED;
2222 cv_broadcast(&rs->sc_cv);
2223 }
2224 mutex_exit(&rs->sc_mutex);
2225 }
2226
2227
2228 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2229 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2230 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2231
/*
 * Byte offset (from the start of a component) of the reserved
 * component-info (label) area.  Wrapped in a function so the read and
 * write paths share a single definition.
 */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
2238
2239 static daddr_t
2240 rf_component_info_size(unsigned secsize)
2241 {
2242 daddr_t info_size;
2243
2244 KASSERT(secsize);
2245 if (secsize > RF_COMPONENT_INFO_SIZE)
2246 info_size = secsize;
2247 else
2248 info_size = RF_COMPONENT_INFO_SIZE;
2249
2250 return info_size;
2251 }
2252
2253 static daddr_t
2254 rf_parity_map_offset(RF_Raid_t *raidPtr)
2255 {
2256 daddr_t map_offset;
2257
2258 KASSERT(raidPtr->bytesPerSector);
2259 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2260 map_offset = raidPtr->bytesPerSector;
2261 else
2262 map_offset = RF_COMPONENT_INFO_SIZE;
2263 map_offset += rf_component_info_offset();
2264
2265 return map_offset;
2266 }
2267
2268 static daddr_t
2269 rf_parity_map_size(RF_Raid_t *raidPtr)
2270 {
2271 daddr_t map_size;
2272
2273 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2274 map_size = raidPtr->bytesPerSector;
2275 else
2276 map_size = RF_PARITY_MAP_SIZE;
2277
2278 return map_size;
2279 }
2280
2281 int
2282 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2283 {
2284 RF_ComponentLabel_t *clabel;
2285
2286 clabel = raidget_component_label(raidPtr, col);
2287 clabel->clean = RF_RAID_CLEAN;
2288 raidflush_component_label(raidPtr, col);
2289 return(0);
2290 }
2291
2292
2293 int
2294 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2295 {
2296 RF_ComponentLabel_t *clabel;
2297
2298 clabel = raidget_component_label(raidPtr, col);
2299 clabel->clean = RF_RAID_DIRTY;
2300 raidflush_component_label(raidPtr, col);
2301 return(0);
2302 }
2303
/*
 * Read the on-disk component label of column `col' into the in-core
 * copy kept in raid_cinfo[col].ci_label.  Returns the error from
 * raidread_component_label() (0 on success).
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2313
/*
 * Return a pointer to the in-core component label of column `col'.
 * Callers modify it in place and then push it to disk with
 * raidflush_component_label().
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2319
2320 int
2321 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2322 {
2323 RF_ComponentLabel_t *label;
2324
2325 label = &raidPtr->raid_cinfo[col].ci_label;
2326 label->mod_counter = raidPtr->mod_counter;
2327 #ifndef RF_NO_PARITY_MAP
2328 label->parity_map_modcount = label->mod_counter;
2329 #endif
2330 return raidwrite_component_label(raidPtr->bytesPerSector,
2331 raidPtr->Disks[col].dev,
2332 raidPtr->raid_cinfo[col].ci_vp, label);
2333 }
2334
2335
/*
 * Read the component label stored on (dev, b_vp) into *clabel.  The
 * label lives in the reserved component-info area, whose on-disk size
 * depends on the sector size (see rf_component_info_size()).
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
2345
2346 /* ARGSUSED */
2347 static int
2348 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2349 size_t msize, daddr_t offset, daddr_t dsize)
2350 {
2351 struct buf *bp;
2352 int error;
2353
2354 /* XXX should probably ensure that we don't try to do this if
2355 someone has changed rf_protected_sectors. */
2356
2357 if (b_vp == NULL) {
2358 /* For whatever reason, this component is not valid.
2359 Don't try to read a component label from it. */
2360 return(EINVAL);
2361 }
2362
2363 /* get a block of the appropriate size... */
2364 bp = geteblk((int)dsize);
2365 bp->b_dev = dev;
2366
2367 /* get our ducks in a row for the read */
2368 bp->b_blkno = offset / DEV_BSIZE;
2369 bp->b_bcount = dsize;
2370 bp->b_flags |= B_READ;
2371 bp->b_resid = dsize;
2372
2373 bdev_strategy(bp);
2374 error = biowait(bp);
2375
2376 if (!error) {
2377 memcpy(data, bp->b_data, msize);
2378 }
2379
2380 brelse(bp, 0);
2381 return(error);
2382 }
2383
2384
/*
 * Write *clabel to the component-info area on (dev, b_vp),
 * synchronously (asyncp == 0).  Returns 0 on success or an errno.
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
2394
/* ARGSUSED */
/*
 * Write `msize' bytes from `data' into a `dsize'-byte reserved
 * component area at byte `offset' on dev; the remainder of the area is
 * zero-filled.  When `asyncp' is set the write is fired off B_ASYNC
 * and 0 is returned immediately (the buffer is then released on I/O
 * completion, and any write error goes unreported to the caller).
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
	bp->b_resid = dsize;

	/* zero-pad the area past the msize bytes of payload */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* async: don't wait; buffer is released at biodone time */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
2429
2430 void
2431 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2432 {
2433 int c;
2434
2435 for (c = 0; c < raidPtr->numCol; c++) {
2436 /* Skip dead disks. */
2437 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2438 continue;
2439 /* XXXjld: what if an error occurs here? */
2440 raidwrite_component_area(raidPtr->Disks[c].dev,
2441 raidPtr->raid_cinfo[c].ci_vp, map,
2442 RF_PARITYMAP_NBYTE,
2443 rf_parity_map_offset(raidPtr),
2444 rf_parity_map_size(raidPtr), 0);
2445 }
2446 }
2447
2448 void
2449 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2450 {
2451 struct rf_paritymap_ondisk tmp;
2452 int c,first;
2453
2454 first=1;
2455 for (c = 0; c < raidPtr->numCol; c++) {
2456 /* Skip dead disks. */
2457 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2458 continue;
2459 raidread_component_area(raidPtr->Disks[c].dev,
2460 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2461 RF_PARITYMAP_NBYTE,
2462 rf_parity_map_offset(raidPtr),
2463 rf_parity_map_size(raidPtr));
2464 if (first) {
2465 memcpy(map, &tmp, sizeof(*map));
2466 first = 0;
2467 } else {
2468 rf_paritymap_merge(map, &tmp);
2469 }
2470 }
2471 }
2472
/*
 * Mark every live component (and every in-use spare) of the set dirty,
 * after bumping the set's modification counter.  Components that have
 * failed, or are recorded as spared-for, are left untouched.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare stands in for.
			   NOTE(review): scol keeps its previous value
			   (-1 initially) if no column references this
			   spare — presumably cannot happen for
			   rf_ds_used_spare; confirm. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2532
2533
/*
 * Rewrite the component labels of all optimal components and all
 * in-use spares to reflect the current state of the set: bump
 * mod_counter, note optimal status and (unless the unit number was
 * changed) the raid unit we are configured as.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, the labels'
 * clean bits are set as well.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2611
2612 void
2613 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2614 {
2615
2616 if (vp != NULL) {
2617 if (auto_configured == 1) {
2618 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2619 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2620 vput(vp);
2621
2622 } else {
2623 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2624 }
2625 }
2626 }
2627
2628
2629 void
2630 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2631 {
2632 int r,c;
2633 struct vnode *vp;
2634 int acd;
2635
2636
2637 /* We take this opportunity to close the vnodes like we should.. */
2638
2639 for (c = 0; c < raidPtr->numCol; c++) {
2640 vp = raidPtr->raid_cinfo[c].ci_vp;
2641 acd = raidPtr->Disks[c].auto_configured;
2642 rf_close_component(raidPtr, vp, acd);
2643 raidPtr->raid_cinfo[c].ci_vp = NULL;
2644 raidPtr->Disks[c].auto_configured = 0;
2645 }
2646
2647 for (r = 0; r < raidPtr->numSpare; r++) {
2648 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2649 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2650 rf_close_component(raidPtr, vp, acd);
2651 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2652 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2653 }
2654 }
2655
2656
/*
 * Kernel thread body: fail component req->col of req->raidPtr,
 * reconstructing to a spare if RF_FDFLAGS_RECON is set in req->flags.
 * Owns (and frees) the request; exits via kthread_exit().
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Fail the disk; second argument selects reconstruction. */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* The request was handed to us by our creator; we free it. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2678
/*
 * Kernel thread body: rewrite all parity of the set.  On success the
 * in-core parity status becomes clean (the component labels' clean
 * bits are set at shutdown/final update).  Wakes any thread waiting
 * on parity_rewrite_in_progress for shutdown; exits via kthread_exit().
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2709
2710
/*
 * Kernel thread body: copy reconstructed data back from a hot spare to
 * a replaced component.  Exits via kthread_exit() when done.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2725
2726
/*
 * Kernel thread body: rebuild component req->col in place (onto the
 * same device).  Owns (and frees) the request; exits via kthread_exit().
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2744
2745 static RF_AutoConfig_t *
2746 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2747 const char *cname, RF_SectorCount_t size, uint64_t numsecs,
2748 unsigned secsize)
2749 {
2750 int good_one = 0;
2751 RF_ComponentLabel_t *clabel;
2752 RF_AutoConfig_t *ac;
2753
2754 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
2755 if (clabel == NULL) {
2756 oomem:
2757 while(ac_list) {
2758 ac = ac_list;
2759 if (ac->clabel)
2760 free(ac->clabel, M_RAIDFRAME);
2761 ac_list = ac_list->next;
2762 free(ac, M_RAIDFRAME);
2763 }
2764 printf("RAID auto config: out of memory!\n");
2765 return NULL; /* XXX probably should panic? */
2766 }
2767
2768 if (!raidread_component_label(secsize, dev, vp, clabel)) {
2769 /* Got the label. Does it look reasonable? */
2770 if (rf_reasonable_label(clabel, numsecs) &&
2771 (rf_component_label_partitionsize(clabel) <= size)) {
2772 #ifdef DEBUG
2773 printf("Component on: %s: %llu\n",
2774 cname, (unsigned long long)size);
2775 rf_print_component_label(clabel);
2776 #endif
2777 /* if it's reasonable, add it, else ignore it. */
2778 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2779 M_NOWAIT);
2780 if (ac == NULL) {
2781 free(clabel, M_RAIDFRAME);
2782 goto oomem;
2783 }
2784 strlcpy(ac->devname, cname, sizeof(ac->devname));
2785 ac->dev = dev;
2786 ac->vp = vp;
2787 ac->clabel = clabel;
2788 ac->next = ac_list;
2789 ac_list = ac;
2790 good_one = 1;
2791 }
2792 }
2793 if (!good_one) {
2794 /* cleanup */
2795 free(clabel, M_RAIDFRAME);
2796 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2797 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2798 vput(vp);
2799 }
2800 return ac_list;
2801 }
2802
/*
 * Scan all disk-class devices in the system for RAIDframe components
 * and collect the plausible ones into an RF_AutoConfig_t list (built
 * by rf_get_component()).  Returns the list head, or NULL if nothing
 * was found.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			/* NOTE(review): minor() applied to a unit number
			   looks odd, but this is the historical usage here. */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* only wedges explicitly typed RAIDframe qualify */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes over vp on success */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists. Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
				    label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
3006
3007
3008 int
3009 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3010 {
3011
3012 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3013 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3014 ((clabel->clean == RF_RAID_CLEAN) ||
3015 (clabel->clean == RF_RAID_DIRTY)) &&
3016 clabel->row >=0 &&
3017 clabel->column >= 0 &&
3018 clabel->num_rows > 0 &&
3019 clabel->num_columns > 0 &&
3020 clabel->row < clabel->num_rows &&
3021 clabel->column < clabel->num_columns &&
3022 clabel->blockSize > 0 &&
3023 /*
3024 * numBlocksHi may contain garbage, but it is ok since
3025 * the type is unsigned. If it is really garbage,
3026 * rf_fix_old_label_size() will fix it.
3027 */
3028 rf_component_label_numblocks(clabel) > 0) {
3029 /*
3030 * label looks reasonable enough...
3031 * let's make sure it has no old garbage.
3032 */
3033 if (numsecs)
3034 rf_fix_old_label_size(clabel, numsecs);
3035 return(1);
3036 }
3037 return(0);
3038 }
3039
3040
3041 /*
3042 * For reasons yet unknown, some old component labels have garbage in
3043 * the newer numBlocksHi region, and this causes lossage. Since those
3044 * disks will also have numsecs set to less than 32 bits of sectors,
3045 * we can determine when this corruption has occurred, and fix it.
3046 *
3047 * The exact same problem, with the same unknown reason, happens to
3048 * the partitionSizeHi member as well.
3049 */
3050 static void
3051 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3052 {
3053
3054 if (numsecs < ((uint64_t)1 << 32)) {
3055 if (clabel->numBlocksHi) {
3056 printf("WARNING: total sectors < 32 bits, yet "
3057 "numBlocksHi set\n"
3058 "WARNING: resetting numBlocksHi to zero.\n");
3059 clabel->numBlocksHi = 0;
3060 }
3061
3062 if (clabel->partitionSizeHi) {
3063 printf("WARNING: total sectors < 32 bits, yet "
3064 "partitionSizeHi set\n"
3065 "WARNING: resetting partitionSizeHi to zero.\n");
3066 clabel->partitionSizeHi = 0;
3067 }
3068 }
3069 }
3070
3071
3072 #ifdef DEBUG
/*
 * Dump a component label to the console (compiled only under DEBUG).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* indexed by root_partition masked to its low 2 bits */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	    clabel->row, clabel->column,
	    clabel->num_rows, clabel->num_columns);
	printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
	    clabel->version, clabel->serial_number,
	    clabel->mod_counter);
	printf(" Clean: %s Status: %d\n",
	    clabel->clean ? "Yes" : "No", clabel->status);
	printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	    clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n",
	    (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf(" Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf(" Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf(" Config order: %d\n", clabel->config_order);
#endif

}
3104 #endif
3105
3106 RF_ConfigSet_t *
3107 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3108 {
3109 RF_AutoConfig_t *ac;
3110 RF_ConfigSet_t *config_sets;
3111 RF_ConfigSet_t *cset;
3112 RF_AutoConfig_t *ac_next;
3113
3114
3115 config_sets = NULL;
3116
3117 /* Go through the AutoConfig list, and figure out which components
3118 belong to what sets. */
3119 ac = ac_list;
3120 while(ac!=NULL) {
3121 /* we're going to putz with ac->next, so save it here
3122 for use at the end of the loop */
3123 ac_next = ac->next;
3124
3125 if (config_sets == NULL) {
3126 /* will need at least this one... */
3127 config_sets = (RF_ConfigSet_t *)
3128 malloc(sizeof(RF_ConfigSet_t),
3129 M_RAIDFRAME, M_NOWAIT);
3130 if (config_sets == NULL) {
3131 panic("rf_create_auto_sets: No memory!");
3132 }
3133 /* this one is easy :) */
3134 config_sets->ac = ac;
3135 config_sets->next = NULL;
3136 config_sets->rootable = 0;
3137 ac->next = NULL;
3138 } else {
3139 /* which set does this component fit into? */
3140 cset = config_sets;
3141 while(cset!=NULL) {
3142 if (rf_does_it_fit(cset, ac)) {
3143 /* looks like it matches... */
3144 ac->next = cset->ac;
3145 cset->ac = ac;
3146 break;
3147 }
3148 cset = cset->next;
3149 }
3150 if (cset==NULL) {
3151 /* didn't find a match above... new set..*/
3152 cset = (RF_ConfigSet_t *)
3153 malloc(sizeof(RF_ConfigSet_t),
3154 M_RAIDFRAME, M_NOWAIT);
3155 if (cset == NULL) {
3156 panic("rf_create_auto_sets: No memory!");
3157 }
3158 cset->ac = ac;
3159 ac->next = NULL;
3160 cset->next = config_sets;
3161 cset->rootable = 0;
3162 config_sets = cset;
3163 }
3164 }
3165 ac = ac_next;
3166 }
3167
3168
3169 return(config_sets);
3170 }
3171
3172 static int
3173 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3174 {
3175 RF_ComponentLabel_t *clabel1, *clabel2;
3176
3177 /* If this one matches the *first* one in the set, that's good
3178 enough, since the other members of the set would have been
3179 through here too... */
3180 /* note that we are not checking partitionSize here..
3181
3182 Note that we are also not checking the mod_counters here.
3183 If everything else matches except the mod_counter, that's
3184 good enough for this test. We will deal with the mod_counters
3185 a little later in the autoconfiguration process.
3186
3187 (clabel1->mod_counter == clabel2->mod_counter) &&
3188
3189 The reason we don't check for this is that failed disks
3190 will have lower modification counts. If those disks are
3191 not added to the set they used to belong to, then they will
3192 form their own set, which may result in 2 different sets,
3193 for example, competing to be configured at raid0, and
3194 perhaps competing to be the root filesystem set. If the
3195 wrong ones get configured, or both attempt to become /,
3196 weird behaviour and or serious lossage will occur. Thus we
3197 need to bring them into the fold here, and kick them out at
3198 a later point.
3199
3200 */
3201
3202 clabel1 = cset->ac->clabel;
3203 clabel2 = ac->clabel;
3204 if ((clabel1->version == clabel2->version) &&
3205 (clabel1->serial_number == clabel2->serial_number) &&
3206 (clabel1->num_rows == clabel2->num_rows) &&
3207 (clabel1->num_columns == clabel2->num_columns) &&
3208 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3209 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3210 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3211 (clabel1->parityConfig == clabel2->parityConfig) &&
3212 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3213 (clabel1->blockSize == clabel2->blockSize) &&
3214 rf_component_label_numblocks(clabel1) ==
3215 rf_component_label_numblocks(clabel2) &&
3216 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3217 (clabel1->root_partition == clabel2->root_partition) &&
3218 (clabel1->last_unit == clabel2->last_unit) &&
3219 (clabel1->config_order == clabel2->config_order)) {
3220 /* if it get's here, it almost *has* to be a match */
3221 } else {
3222 /* it's not consistent with somebody in the set..
3223 punt */
3224 return(0);
3225 }
3226 /* all was fine.. it must fit... */
3227 return(1);
3228 }
3229
/*
 * Decide whether configuration set cset has enough live components
 * (those whose labels carry the set's newest mod_counter) to be
 * configured.  Returns 1 if the set is viable, 0 if too many
 * components are missing for its RAID level.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* the newest (largest) mod_counter wins; components with older
	   values are stale and are counted as missing below */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (mod_counter matches) component
		   claiming column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
			/* Didn't find one here! */
			/* special case for RAID 1, especially
			   where there are more than 2
			   components (where RAIDframe treats
			   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just did an even component, and we didn't
			   bail.. reset the even_pair_failed flag,
			   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 survives no missing components; RAID 4/5 survive one */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3332
3333 void
3334 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3335 RF_Raid_t *raidPtr)
3336 {
3337 RF_ComponentLabel_t *clabel;
3338 int i;
3339
3340 clabel = ac->clabel;
3341
3342 /* 1. Fill in the common stuff */
3343 config->numRow = clabel->num_rows = 1;
3344 config->numCol = clabel->num_columns;
3345 config->numSpare = 0; /* XXX should this be set here? */
3346 config->sectPerSU = clabel->sectPerSU;
3347 config->SUsPerPU = clabel->SUsPerPU;
3348 config->SUsPerRU = clabel->SUsPerRU;
3349 config->parityConfig = clabel->parityConfig;
3350 /* XXX... */
3351 strcpy(config->diskQueueType,"fifo");
3352 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3353 config->layoutSpecificSize = 0; /* XXX ?? */
3354
3355 while(ac!=NULL) {
3356 /* row/col values will be in range due to the checks
3357 in reasonable_label() */
3358 strcpy(config->devnames[0][ac->clabel->column],
3359 ac->devname);
3360 ac = ac->next;
3361 }
3362
3363 for(i=0;i<RF_MAXDBGV;i++) {
3364 config->debugVars[i][0] = 0;
3365 }
3366 }
3367
3368 int
3369 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3370 {
3371 RF_ComponentLabel_t *clabel;
3372 int column;
3373 int sparecol;
3374
3375 raidPtr->autoconfigure = new_value;
3376
3377 for(column=0; column<raidPtr->numCol; column++) {
3378 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3379 clabel = raidget_component_label(raidPtr, column);
3380 clabel->autoconfigure = new_value;
3381 raidflush_component_label(raidPtr, column);
3382 }
3383 }
3384 for(column = 0; column < raidPtr->numSpare ; column++) {
3385 sparecol = raidPtr->numCol + column;
3386 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3387 clabel = raidget_component_label(raidPtr, sparecol);
3388 clabel->autoconfigure = new_value;
3389 raidflush_component_label(raidPtr, sparecol);
3390 }
3391 }
3392 return(new_value);
3393 }
3394
3395 int
3396 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3397 {
3398 RF_ComponentLabel_t *clabel;
3399 int column;
3400 int sparecol;
3401
3402 raidPtr->root_partition = new_value;
3403 for(column=0; column<raidPtr->numCol; column++) {
3404 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3405 clabel = raidget_component_label(raidPtr, column);
3406 clabel->root_partition = new_value;
3407 raidflush_component_label(raidPtr, column);
3408 }
3409 }
3410 for(column = 0; column < raidPtr->numSpare ; column++) {
3411 sparecol = raidPtr->numCol + column;
3412 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3413 clabel = raidget_component_label(raidPtr, sparecol);
3414 clabel->root_partition = new_value;
3415 raidflush_component_label(raidPtr, sparecol);
3416 }
3417 }
3418 return(new_value);
3419 }
3420
3421 void
3422 rf_release_all_vps(RF_ConfigSet_t *cset)
3423 {
3424 RF_AutoConfig_t *ac;
3425
3426 ac = cset->ac;
3427 while(ac!=NULL) {
3428 /* Close the vp, and give it back */
3429 if (ac->vp) {
3430 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3431 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3432 vput(ac->vp);
3433 ac->vp = NULL;
3434 }
3435 ac = ac->next;
3436 }
3437 }
3438
3439
3440 void
3441 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3442 {
3443 RF_AutoConfig_t *ac;
3444 RF_AutoConfig_t *next_ac;
3445
3446 ac = cset->ac;
3447 while(ac!=NULL) {
3448 next_ac = ac->next;
3449 /* nuke the label */
3450 free(ac->clabel, M_RAIDFRAME);
3451 /* cleanup the config structure */
3452 free(ac, M_RAIDFRAME);
3453 /* "next.." */
3454 ac = next_ac;
3455 }
3456 /* and, finally, nuke the config set */
3457 free(cset, M_RAIDFRAME);
3458 }
3459
3460
/*
 * Populate a component label from the current state of the RAID set:
 * identity (serial number, mod counter), geometry (columns, stripe
 * unit sizes, block size/count), and configuration flags.  The label
 * is marked dirty and optimal; the caller is responsible for writing
 * it out.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* geometry */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* configuration flags */
	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3493
/*
 * Autoconfigure a complete config set: allocate a config structure,
 * find a free raid unit (preferring the unit recorded in the label),
 * build the configuration from the labels, and run rf_Configure().
 * Returns the configured softc, or NULL on failure.  The temporary
 * config structure is always freed before return.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Probe units upward from last_unit until one is unattached
	   (raidget(..., false) returns NULL) or attached but not yet
	   valid (sc_r.valid == 0), which we can reuse. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* No existing softc at the chosen unit: create one. */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the softc back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3577
/*
 * Initialize a pool at IPL_BIO and set its watermarks: high water at
 * xmax, then pre-allocate (prime) and set the low water mark to xmin
 * so allocations from interrupt context don't starve.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3587
3588 /*
3589 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3590 * to see if there is IO pending and if that IO could possibly be done
3591 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3592 * otherwise.
3593 *
3594 */
3595 int
3596 rf_buf_queue_check(RF_Raid_t *raidPtr)
3597 {
3598 struct raid_softc *rs;
3599 struct dk_softc *dksc;
3600
3601 rs = raidPtr->softc;
3602 dksc = &rs->sc_dksc;
3603
3604 if ((rs->sc_flags & RAIDF_INITED) == 0)
3605 return 1;
3606
3607 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3608 /* there is work to do */
3609 return 0;
3610 }
3611 /* default is nothing to do */
3612 return 1;
3613 }
3614
3615 int
3616 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3617 {
3618 uint64_t numsecs;
3619 unsigned secsize;
3620 int error;
3621
3622 error = getdisksize(vp, &numsecs, &secsize);
3623 if (error == 0) {
3624 diskPtr->blockSize = secsize;
3625 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3626 diskPtr->partitionSize = numsecs;
3627 return 0;
3628 }
3629 return error;
3630 }
3631
/* Autoconf match: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3637
/* Autoconf attach: nothing to do; real setup happens at configure time. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3642
3643
3644 static int
3645 raid_detach(device_t self, int flags)
3646 {
3647 int error;
3648 struct raid_softc *rs = raidsoftc(self);
3649
3650 if (rs == NULL)
3651 return ENXIO;
3652
3653 if ((error = raidlock(rs)) != 0)
3654 return (error);
3655
3656 error = raid_detach_unlocked(rs);
3657
3658 raidunlock(rs);
3659
3660 /* XXX raid can be referenced here */
3661
3662 if (error)
3663 return error;
3664
3665 /* Free the softc */
3666 raidput(rs);
3667
3668 return 0;
3669 }
3670
/*
 * Publish a synthetic disk geometry for the RAID set to the disk(9)
 * layer.  Only total sectors and sector size are real; the
 * sectors/track and track counts are fabricated (there is no physical
 * geometry for a RAID set).
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	/* "sectors per track" = data sectors per stripe, so a "track"
	   lines up with a stripe. */
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* arbitrary: 4 tracks per cylinder per column */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
3686
3687 /*
3688 * Get cache info for all the components (including spares).
3689 * Returns intersection of all the cache flags of all disks, or first
3690 * error if any encountered.
3691 * XXXfua feature flags can change as spares are added - lock down somehow
3692 */
3693 static int
3694 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3695 {
3696 int c;
3697 int error;
3698 int dkwhole = 0, dkpart;
3699
3700 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3701 /*
3702 * Check any non-dead disk, even when currently being
3703 * reconstructed.
3704 */
3705 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
3706 || raidPtr->Disks[c].status == rf_ds_reconstructing) {
3707 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3708 DIOCGCACHE, &dkpart, FREAD, NOCRED);
3709 if (error) {
3710 if (error != ENODEV) {
3711 printf("raid%d: get cache for component %s failed\n",
3712 raidPtr->raidid,
3713 raidPtr->Disks[c].devname);
3714 }
3715
3716 return error;
3717 }
3718
3719 if (c == 0)
3720 dkwhole = dkpart;
3721 else
3722 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3723 }
3724 }
3725
3726 *data = dkwhole;
3727
3728 return 0;
3729 }
3730
3731 /*
3732 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3733 * We end up returning whatever error was returned by the first cache flush
3734 * that fails.
3735 */
3736
3737 int
3738 rf_sync_component_caches(RF_Raid_t *raidPtr)
3739 {
3740 int c, sparecol;
3741 int e,error;
3742 int force = 1;
3743
3744 error = 0;
3745 for (c = 0; c < raidPtr->numCol; c++) {
3746 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3747 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3748 &force, FWRITE, NOCRED);
3749 if (e) {
3750 if (e != ENODEV)
3751 printf("raid%d: cache flush to component %s failed.\n",
3752 raidPtr->raidid, raidPtr->Disks[c].devname);
3753 if (error == 0) {
3754 error = e;
3755 }
3756 }
3757 }
3758 }
3759
3760 for( c = 0; c < raidPtr->numSpare ; c++) {
3761 sparecol = raidPtr->numCol + c;
3762 /* Need to ensure that the reconstruct actually completed! */
3763 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3764 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3765 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3766 if (e) {
3767 if (e != ENODEV)
3768 printf("raid%d: cache flush to component %s failed.\n",
3769 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3770 if (error == 0) {
3771 error = e;
3772 }
3773 }
3774 }
3775 }
3776 return error;
3777 }
3778
3779 /*
3780 * Module interface
3781 */
3782
3783 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
3784
3785 #ifdef _MODULE
3786 CFDRIVER_DECL(raid, DV_DISK, NULL);
3787 #endif
3788
3789 static int raid_modcmd(modcmd_t, void *);
3790 static int raid_modcmd_init(void);
3791 static int raid_modcmd_fini(void);
3792
3793 static int
3794 raid_modcmd(modcmd_t cmd, void *data)
3795 {
3796 int error;
3797
3798 error = 0;
3799 switch (cmd) {
3800 case MODULE_CMD_INIT:
3801 error = raid_modcmd_init();
3802 break;
3803 case MODULE_CMD_FINI:
3804 error = raid_modcmd_fini();
3805 break;
3806 default:
3807 error = ENOTTY;
3808 break;
3809 }
3810 return error;
3811 }
3812
/*
 * Module initialization: set up global locks, attach the device
 * switch, cfdriver (module build only) and cfattach — rolling each
 * step back on failure — then boot RAIDframe and register the
 * autoconfiguration finalizer.  Returns 0 or the first fatal errno.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* Let devsw pick the majors; EEXIST means they were already
	   attached (e.g. built-in), which is fine. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* roll back the devsw attach */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* roll back cfdriver and devsw attaches */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* NOTE(review): error is always 0 when we get here (every
	   failure above returned early), so this check is vestigial. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* non-fatal: autoconfiguration just won't happen */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
3883
/*
 * Module finalization: refuse to unload while raid devices exist,
 * then undo raid_modcmd_init() in reverse order (cfattach, cfdriver,
 * devsw, RAIDframe core, global locks), re-attaching what was already
 * detached if a later step fails.  Returns 0 or an errno.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist. */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* re-attach the cfattach we just removed */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* re-attach cfdriver and cfattach */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
3933