/*	$NetBSD: rf_netbsdkintf.c,v 1.316.2.8 2017/08/28 17:52:26 skrll Exp $	*/
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.316.2.8 2017/08/28 17:52:26 skrll Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130
131 #include <prop/proplib.h>
132
133 #include <dev/raidframe/raidframevar.h>
134 #include <dev/raidframe/raidframeio.h>
135 #include <dev/raidframe/rf_paritymap.h>
136
137 #include "rf_raid.h"
138 #include "rf_copyback.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150
151 #ifdef COMPAT_50
152 #include "rf_compat50.h"
153 #endif
154
155 #include "ioconf.h"
156
157 #ifdef DEBUG
158 int rf_kdebug_level = 0;
159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
160 #else /* DEBUG */
161 #define db1_printf(a) { }
162 #endif /* DEBUG */
163
164 #ifdef DEBUG_ROOT
165 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
166 #else
167 #define DPRINTF(a, ...)
168 #endif
169
170 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
171 static rf_declare_mutex2(rf_sparet_wait_mutex);
172 static rf_declare_cond2(rf_sparet_wait_cv);
173 static rf_declare_cond2(rf_sparet_resp_cv);
174
175 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
176 * spare table */
177 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
178 * installation process */
179 #endif
180
181 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
182
183 /* prototypes */
184 static void KernelWakeupFunc(struct buf *);
185 static void InitBP(struct buf *, struct vnode *, unsigned,
186 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
187 void *, int, struct proc *);
188 struct raid_softc;
189 static void raidinit(struct raid_softc *);
190 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
191 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
192
193 static int raid_match(device_t, cfdata_t, void *);
194 static void raid_attach(device_t, device_t, void *);
195 static int raid_detach(device_t, int);
196
197 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
198 daddr_t, daddr_t);
199 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
200 daddr_t, daddr_t, int);
201
202 static int raidwrite_component_label(unsigned,
203 dev_t, struct vnode *, RF_ComponentLabel_t *);
204 static int raidread_component_label(unsigned,
205 dev_t, struct vnode *, RF_ComponentLabel_t *);
206
207 static int raid_diskstart(device_t, struct buf *bp);
208 static int raid_dumpblocks(device_t, void *, daddr_t, int);
209 static int raid_lastclose(device_t);
210
211 static dev_type_open(raidopen);
212 static dev_type_close(raidclose);
213 static dev_type_read(raidread);
214 static dev_type_write(raidwrite);
215 static dev_type_ioctl(raidioctl);
216 static dev_type_strategy(raidstrategy);
217 static dev_type_dump(raiddump);
218 static dev_type_size(raidsize);
219
/*
 * Block device switch: raid(4) units look like ordinary disks
 * (D_DISK); discard is not supported.
 */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
230
/*
 * Character (raw) device switch: read/write go through physio via
 * raidstrategy; tty/poll/mmap/kqfilter/stop/discard are unsupported.
 */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
245
/*
 * dk(9) driver hooks: the common disk framework calls back into
 * raid(4) for open/close/strategy, per-request start (raid_diskstart),
 * crash dumps (raid_dumpblocks) and last-close processing.  minphys is
 * the stock transfer-size clamp.
 */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
255
/*
 * Per-unit software state.  One of these exists for every raid(4)
 * unit, linked on the global `raids' list and protected by sc_mutex
 * (with sc_cv for raidlock()/raidunlock() waiters).
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk state (must be first) */
	int     sc_unit;		/* raidN unit number */
	int     sc_flags;		/* flags (RAIDF_*, below) */
	int     sc_cflags;		/* configuration flags */
	kmutex_t sc_mutex;		/* interlock mutex */
	kcondvar_t sc_cv;		/* and the condvar */
	uint64_t sc_size;		/* size of the raid device */
	char    sc_xname[20];		/* XXX external name */
	RF_Raid_t sc_r;			/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global list */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN	0x02	/* unit is being shutdown */
#define RAIDF_DETACH	0x04	/* detach after final close */
#define RAIDF_WANTED	0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED	0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

/* dev_t -> unit number, and device_t -> softc back-pointer helpers */
#define	raidunit(x)	DISKUNIT(x)
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
278
279 extern struct cfdriver raid_cd;
280 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
281 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
282 DVF_DETACH_SHUTDOWN);
283
284 /*
285 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
286 * Be aware that large numbers can allow the driver to consume a lot of
287 * kernel memory, especially on writes, and in degraded mode reads.
288 *
289 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
290 * a single 64K write will typically require 64K for the old data,
291 * 64K for the old parity, and 64K for the new parity, for a total
292 * of 192K (if the parity buffer is not re-used immediately).
293 * Even it if is used immediately, that's still 128K, which when multiplied
294 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
295 *
296 * Now in degraded mode, for example, a 64K read on the above setup may
297 * require data reconstruction, which will require *all* of the 4 remaining
298 * disks to participate -- 4 * 32K/disk == 128K again.
299 */
300
301 #ifndef RAIDOUTSTANDING
302 #define RAIDOUTSTANDING 6
303 #endif
304
305 #define RAIDLABELDEV(dev) \
306 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
307
308 /* declared here, and made public, for the benefit of KVM stuff.. */
309
310 static int raidlock(struct raid_softc *);
311 static void raidunlock(struct raid_softc *);
312
313 static int raid_detach_unlocked(struct raid_softc *);
314
315 static void rf_markalldirty(RF_Raid_t *);
316 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
317
318 void rf_ReconThread(struct rf_recon_req *);
319 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
320 void rf_CopybackThread(RF_Raid_t *raidPtr);
321 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
322 int rf_autoconfig(device_t);
323 void rf_buildroothack(RF_ConfigSet_t *);
324
325 RF_AutoConfig_t *rf_find_raid_components(void);
326 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
327 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
328 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
329 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
330 int rf_set_autoconfig(RF_Raid_t *, int);
331 int rf_set_rootpartition(RF_Raid_t *, int);
332 void rf_release_all_vps(RF_ConfigSet_t *);
333 void rf_cleanup_config_set(RF_ConfigSet_t *);
334 int rf_have_enough_components(RF_ConfigSet_t *);
335 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
336 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
337
338 /*
339 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
340 * Note that this is overridden by having RAID_AUTOCONFIG as an option
341 * in the kernel config file.
342 */
343 #ifdef RAID_AUTOCONFIG
344 int raidautoconfig = 1;
345 #else
346 int raidautoconfig = 0;
347 #endif
348 static bool raidautoconfigdone = false;
349
350 struct RF_Pools_s rf_pools;
351
352 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
353 static kmutex_t raid_lock;
354
355 static struct raid_softc *
356 raidcreate(int unit) {
357 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
358 sc->sc_unit = unit;
359 cv_init(&sc->sc_cv, "raidunit");
360 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
361 return sc;
362 }
363
364 static void
365 raiddestroy(struct raid_softc *sc) {
366 cv_destroy(&sc->sc_cv);
367 mutex_destroy(&sc->sc_mutex);
368 kmem_free(sc, sizeof(*sc));
369 }
370
371 static struct raid_softc *
372 raidget(int unit, bool create) {
373 struct raid_softc *sc;
374 if (unit < 0) {
375 #ifdef DIAGNOSTIC
376 panic("%s: unit %d!", __func__, unit);
377 #endif
378 return NULL;
379 }
380 mutex_enter(&raid_lock);
381 LIST_FOREACH(sc, &raids, sc_link) {
382 if (sc->sc_unit == unit) {
383 mutex_exit(&raid_lock);
384 return sc;
385 }
386 }
387 mutex_exit(&raid_lock);
388 if (!create)
389 return NULL;
390 if ((sc = raidcreate(unit)) == NULL)
391 return NULL;
392 mutex_enter(&raid_lock);
393 LIST_INSERT_HEAD(&raids, sc, sc_link);
394 mutex_exit(&raid_lock);
395 return sc;
396 }
397
398 static void
399 raidput(struct raid_softc *sc) {
400 mutex_enter(&raid_lock);
401 LIST_REMOVE(sc, sc_link);
402 mutex_exit(&raid_lock);
403 raiddestroy(sc);
404 }
405
/*
 * Legacy pseudo-device attach hook.  Intentionally empty: device
 * attachment and the associated initialization now happen as part of
 * the module initialization.
 */
void
raidattach(int num)
{
}
415
416 int
417 rf_autoconfig(device_t self)
418 {
419 RF_AutoConfig_t *ac_list;
420 RF_ConfigSet_t *config_sets;
421
422 if (!raidautoconfig || raidautoconfigdone == true)
423 return (0);
424
425 /* XXX This code can only be run once. */
426 raidautoconfigdone = true;
427
428 #ifdef __HAVE_CPU_BOOTCONF
429 /*
430 * 0. find the boot device if needed first so we can use it later
431 * this needs to be done before we autoconfigure any raid sets,
432 * because if we use wedges we are not going to be able to open
433 * the boot device later
434 */
435 if (booted_device == NULL)
436 cpu_bootconf();
437 #endif
438 /* 1. locate all RAID components on the system */
439 aprint_debug("Searching for RAID components...\n");
440 ac_list = rf_find_raid_components();
441
442 /* 2. Sort them into their respective sets. */
443 config_sets = rf_create_auto_sets(ac_list);
444
445 /*
446 * 3. Evaluate each set and configure the valid ones.
447 * This gets done in rf_buildroothack().
448 */
449 rf_buildroothack(config_sets);
450
451 return 1;
452 }
453
454 static int
455 rf_containsboot(RF_Raid_t *r, device_t bdv) {
456 const char *bootname = device_xname(bdv);
457 size_t len = strlen(bootname);
458
459 for (int col = 0; col < r->numCol; col++) {
460 const char *devname = r->Disks[col].devname;
461 devname += sizeof("/dev/") - 1;
462 if (strncmp(devname, "dk", 2) == 0) {
463 const char *parent =
464 dkwedge_get_parent_name(r->Disks[col].dev);
465 if (parent != NULL)
466 devname = parent;
467 }
468 if (strncmp(devname, bootname, len) == 0) {
469 struct raid_softc *sc = r->softc;
470 aprint_debug("raid%d includes boot device %s\n",
471 sc->sc_unit, devname);
472 return 1;
473 }
474 }
475 return 0;
476 }
477
478 void
479 rf_buildroothack(RF_ConfigSet_t *config_sets)
480 {
481 RF_ConfigSet_t *cset;
482 RF_ConfigSet_t *next_cset;
483 int num_root;
484 struct raid_softc *sc, *rsc;
485 struct dk_softc *dksc;
486
487 sc = rsc = NULL;
488 num_root = 0;
489 cset = config_sets;
490 while (cset != NULL) {
491 next_cset = cset->next;
492 if (rf_have_enough_components(cset) &&
493 cset->ac->clabel->autoconfigure == 1) {
494 sc = rf_auto_config_set(cset);
495 if (sc != NULL) {
496 aprint_debug("raid%d: configured ok\n",
497 sc->sc_unit);
498 if (cset->rootable) {
499 rsc = sc;
500 num_root++;
501 }
502 } else {
503 /* The autoconfig didn't work :( */
504 aprint_debug("Autoconfig failed\n");
505 rf_release_all_vps(cset);
506 }
507 } else {
508 /* we're not autoconfiguring this set...
509 release the associated resources */
510 rf_release_all_vps(cset);
511 }
512 /* cleanup */
513 rf_cleanup_config_set(cset);
514 cset = next_cset;
515 }
516 dksc = &rsc->sc_dksc;
517
518 /* if the user has specified what the root device should be
519 then we don't touch booted_device or boothowto... */
520
521 if (rootspec != NULL)
522 return;
523
524 /* we found something bootable... */
525
526 /*
527 * XXX: The following code assumes that the root raid
528 * is the first ('a') partition. This is about the best
529 * we can do with a BSD disklabel, but we might be able
530 * to do better with a GPT label, by setting a specified
531 * attribute to indicate the root partition. We can then
532 * stash the partition number in the r->root_partition
533 * high bits (the bottom 2 bits are already used). For
534 * now we just set booted_partition to 0 when we override
535 * root.
536 */
537 if (num_root == 1) {
538 device_t candidate_root;
539 if (dksc->sc_dkdev.dk_nwedges != 0) {
540 char cname[sizeof(cset->ac->devname)];
541 /* XXX: assume partition 'a' first */
542 snprintf(cname, sizeof(cname), "%s%c",
543 device_xname(dksc->sc_dev), 'a');
544 candidate_root = dkwedge_find_by_wname(cname);
545 DPRINTF("%s: candidate wedge root=%s\n", __func__,
546 cname);
547 if (candidate_root == NULL) {
548 /*
549 * If that is not found, because we don't use
550 * disklabel, return the first dk child
551 * XXX: we can skip the 'a' check above
552 * and always do this...
553 */
554 size_t i = 0;
555 candidate_root = dkwedge_find_by_parent(
556 device_xname(dksc->sc_dev), &i);
557 }
558 DPRINTF("%s: candidate wedge root=%p\n", __func__,
559 candidate_root);
560 } else
561 candidate_root = dksc->sc_dev;
562 DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
563 DPRINTF("%s: booted_device=%p root_partition=%d "
564 "contains_boot=%d\n", __func__, booted_device,
565 rsc->sc_r.root_partition,
566 rf_containsboot(&rsc->sc_r, booted_device));
567 if (booted_device == NULL ||
568 rsc->sc_r.root_partition == 1 ||
569 rf_containsboot(&rsc->sc_r, booted_device)) {
570 booted_device = candidate_root;
571 booted_partition = 0; /* XXX assume 'a' */
572 }
573 } else if (num_root > 1) {
574 DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
575 booted_device);
576
577 /*
578 * Maybe the MD code can help. If it cannot, then
579 * setroot() will discover that we have no
580 * booted_device and will ask the user if nothing was
581 * hardwired in the kernel config file
582 */
583 if (booted_device == NULL)
584 return;
585
586 num_root = 0;
587 mutex_enter(&raid_lock);
588 LIST_FOREACH(sc, &raids, sc_link) {
589 RF_Raid_t *r = &sc->sc_r;
590 if (r->valid == 0)
591 continue;
592
593 if (r->root_partition == 0)
594 continue;
595
596 if (rf_containsboot(r, booted_device)) {
597 num_root++;
598 rsc = sc;
599 dksc = &rsc->sc_dksc;
600 }
601 }
602 mutex_exit(&raid_lock);
603
604 if (num_root == 1) {
605 booted_device = dksc->sc_dev;
606 booted_partition = 0; /* XXX assume 'a' */
607 } else {
608 /* we can't guess.. require the user to answer... */
609 boothowto |= RB_ASKNAME;
610 }
611 }
612 }
613
614 static int
615 raidsize(dev_t dev)
616 {
617 struct raid_softc *rs;
618 struct dk_softc *dksc;
619 unsigned int unit;
620
621 unit = raidunit(dev);
622 if ((rs = raidget(unit, false)) == NULL)
623 return -1;
624 dksc = &rs->sc_dksc;
625
626 if ((rs->sc_flags & RAIDF_INITED) == 0)
627 return -1;
628
629 return dk_size(dksc, dev);
630 }
631
632 static int
633 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
634 {
635 unsigned int unit;
636 struct raid_softc *rs;
637 struct dk_softc *dksc;
638
639 unit = raidunit(dev);
640 if ((rs = raidget(unit, false)) == NULL)
641 return ENXIO;
642 dksc = &rs->sc_dksc;
643
644 if ((rs->sc_flags & RAIDF_INITED) == 0)
645 return ENODEV;
646
647 /*
648 Note that blkno is relative to this particular partition.
649 By adding adding RF_PROTECTED_SECTORS, we get a value that
650 is relative to the partition used for the underlying component.
651 */
652 blkno += RF_PROTECTED_SECTORS;
653
654 return dk_dump(dksc, dev, blkno, va, size);
655 }
656
/*
 * dk(9) dumpblocks callback: write `nblk' blocks from `va' at `blkno'
 * directly to one live component of this set.  Only RAID 1 layouts
 * (one data + one parity column) are supported; picks the best
 * surviving component (see the preference list below) and calls that
 * component's block-device d_dump routine.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? Find which column it spares. */
			scol = -1;
			for (j = 0; j < raidPtr->numCol; j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Dump straight through the chosen component's block device. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
762
/*
 * Open entry point (both block and character).  Creates the softc on
 * first reference, marks all components dirty on the first open of a
 * configured unit, and defers the rest to dk_open().
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int error = 0;
	int part, pmask;

	/* create=true: opening an unconfigured unit reserves the softc */
	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* refuse new opens while the unit is being torn down */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);
}
812
813 static int
814 raid_lastclose(device_t self)
815 {
816 struct raid_softc *rs = raidsoftc(self);
817
818 /* Last one... device is not unconfigured yet.
819 Device shutdown has taken care of setting the
820 clean bits if RAIDF_INITED is not set
821 mark things as clean... */
822
823 rf_update_component_labels(&rs->sc_r,
824 RF_FINAL_COMPONENT_UPDATE);
825
826 /* pass to unlocked code */
827 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
828 rs->sc_flags |= RAIDF_DETACH;
829
830 return 0;
831 }
832
/*
 * Close entry point.  Runs the dk(9) close path for configured units,
 * then performs any detach (requested via RAIDF_DETACH by
 * raid_lastclose()) or softc teardown (RAIDF_SHUTDOWN on an
 * unconfigured unit) outside the unit lock.
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* set by raid_lastclose() if this was the final close */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		/* shutdown of a never-configured unit: drop the softc */
		raidput(rs);
	}

	return (error);
}
872
/*
 * Poke the RAIDframe I/O thread: signal iodone_cv under iodone_lock so
 * queued work gets serviced.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
880
881 static void
882 raidstrategy(struct buf *bp)
883 {
884 unsigned int unit;
885 struct raid_softc *rs;
886 struct dk_softc *dksc;
887 RF_Raid_t *raidPtr;
888
889 unit = raidunit(bp->b_dev);
890 if ((rs = raidget(unit, false)) == NULL) {
891 bp->b_error = ENXIO;
892 goto fail;
893 }
894 if ((rs->sc_flags & RAIDF_INITED) == 0) {
895 bp->b_error = ENXIO;
896 goto fail;
897 }
898 dksc = &rs->sc_dksc;
899 raidPtr = &rs->sc_r;
900
901 /* Queue IO only */
902 if (dk_strategy_defer(dksc, bp))
903 goto done;
904
905 /* schedule the IO to happen at the next convenient time */
906 raid_wakeup(raidPtr);
907
908 done:
909 return;
910
911 fail:
912 bp->b_resid = bp->b_bcount;
913 biodone(bp);
914 }
915
916 static int
917 raid_diskstart(device_t dev, struct buf *bp)
918 {
919 struct raid_softc *rs = raidsoftc(dev);
920 RF_Raid_t *raidPtr;
921
922 raidPtr = &rs->sc_r;
923 if (!raidPtr->valid) {
924 db1_printf(("raid is not valid..\n"));
925 return ENODEV;
926 }
927
928 /* XXX */
929 bp->b_resid = 0;
930
931 return raiddoaccess(raidPtr, bp);
932 }
933
934 void
935 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
936 {
937 struct raid_softc *rs;
938 struct dk_softc *dksc;
939
940 rs = raidPtr->softc;
941 dksc = &rs->sc_dksc;
942
943 dk_done(dksc, bp);
944
945 rf_lock_mutex2(raidPtr->mutex);
946 raidPtr->openings++;
947 rf_unlock_mutex2(raidPtr->mutex);
948
949 /* schedule more IO */
950 raid_wakeup(raidPtr);
951 }
952
953 /* ARGSUSED */
954 static int
955 raidread(dev_t dev, struct uio *uio, int flags)
956 {
957 int unit = raidunit(dev);
958 struct raid_softc *rs;
959
960 if ((rs = raidget(unit, false)) == NULL)
961 return ENXIO;
962
963 if ((rs->sc_flags & RAIDF_INITED) == 0)
964 return (ENXIO);
965
966 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
967
968 }
969
970 /* ARGSUSED */
971 static int
972 raidwrite(dev_t dev, struct uio *uio, int flags)
973 {
974 int unit = raidunit(dev);
975 struct raid_softc *rs;
976
977 if ((rs = raidget(unit, false)) == NULL)
978 return ENXIO;
979
980 if ((rs->sc_flags & RAIDF_INITED) == 0)
981 return (ENXIO);
982
983 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
984
985 }
986
/*
 * Tear down a configured unit (caller holds the unit lock).  Refuses
 * with EBUSY while the device is open or reconstruction / parity
 * rewrite / copyback is in progress; otherwise shuts down RAIDframe,
 * drains and frees the buffer queue, and detaches the disk from the
 * dk(9)/disk(9) layers.  Returns 0 if the unit was never configured.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
1024
1025 static int
1026 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1027 {
1028 int unit = raidunit(dev);
1029 int error = 0;
1030 int part, pmask;
1031 struct raid_softc *rs;
1032 struct dk_softc *dksc;
1033 RF_Config_t *k_cfg, *u_cfg;
1034 RF_Raid_t *raidPtr;
1035 RF_RaidDisk_t *diskPtr;
1036 RF_AccTotals_t *totals;
1037 RF_DeviceConfig_t *d_cfg, **ucfgp;
1038 u_char *specific_buf;
1039 int retcode = 0;
1040 int column;
1041 /* int raidid; */
1042 struct rf_recon_req *rrcopy, *rr;
1043 RF_ComponentLabel_t *clabel;
1044 RF_ComponentLabel_t *ci_label;
1045 RF_ComponentLabel_t **clabel_ptr;
1046 RF_SingleComponent_t *sparePtr,*componentPtr;
1047 RF_SingleComponent_t component;
1048 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1049 int i, j, d;
1050
1051 if ((rs = raidget(unit, false)) == NULL)
1052 return ENXIO;
1053 dksc = &rs->sc_dksc;
1054 raidPtr = &rs->sc_r;
1055
1056 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1057 (int) DISKPART(dev), (int) unit, cmd));
1058
1059 /* Must be initialized for these... */
1060 switch (cmd) {
1061 case RAIDFRAME_REWRITEPARITY:
1062 case RAIDFRAME_GET_INFO:
1063 case RAIDFRAME_RESET_ACCTOTALS:
1064 case RAIDFRAME_GET_ACCTOTALS:
1065 case RAIDFRAME_KEEP_ACCTOTALS:
1066 case RAIDFRAME_GET_SIZE:
1067 case RAIDFRAME_FAIL_DISK:
1068 case RAIDFRAME_COPYBACK:
1069 case RAIDFRAME_CHECK_RECON_STATUS:
1070 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1071 case RAIDFRAME_GET_COMPONENT_LABEL:
1072 case RAIDFRAME_SET_COMPONENT_LABEL:
1073 case RAIDFRAME_ADD_HOT_SPARE:
1074 case RAIDFRAME_REMOVE_HOT_SPARE:
1075 case RAIDFRAME_INIT_LABELS:
1076 case RAIDFRAME_REBUILD_IN_PLACE:
1077 case RAIDFRAME_CHECK_PARITY:
1078 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1079 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1080 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1081 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1082 case RAIDFRAME_SET_AUTOCONFIG:
1083 case RAIDFRAME_SET_ROOT:
1084 case RAIDFRAME_DELETE_COMPONENT:
1085 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1086 case RAIDFRAME_PARITYMAP_STATUS:
1087 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1088 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1089 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1090 if ((rs->sc_flags & RAIDF_INITED) == 0)
1091 return (ENXIO);
1092 }
1093
1094 switch (cmd) {
1095 #ifdef COMPAT_50
1096 case RAIDFRAME_GET_INFO50:
1097 return rf_get_info50(raidPtr, data);
1098
1099 case RAIDFRAME_CONFIGURE50:
1100 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1101 return retcode;
1102 goto config;
1103 #endif
1104 /* configure the system */
1105 case RAIDFRAME_CONFIGURE:
1106
1107 if (raidPtr->valid) {
1108 /* There is a valid RAID set running on this unit! */
1109 printf("raid%d: Device already configured!\n",unit);
1110 return(EINVAL);
1111 }
1112
1113 /* copy-in the configuration information */
1114 /* data points to a pointer to the configuration structure */
1115
1116 u_cfg = *((RF_Config_t **) data);
1117 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1118 if (k_cfg == NULL) {
1119 return (ENOMEM);
1120 }
1121 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1122 if (retcode) {
1123 RF_Free(k_cfg, sizeof(RF_Config_t));
1124 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1125 retcode));
1126 goto no_config;
1127 }
1128 goto config;
1129 config:
1130 rs->sc_flags &= ~RAIDF_SHUTDOWN;
1131
1132 /* allocate a buffer for the layout-specific data, and copy it
1133 * in */
1134 if (k_cfg->layoutSpecificSize) {
1135 if (k_cfg->layoutSpecificSize > 10000) {
1136 /* sanity check */
1137 RF_Free(k_cfg, sizeof(RF_Config_t));
1138 retcode = EINVAL;
1139 goto no_config;
1140 }
1141 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1142 (u_char *));
1143 if (specific_buf == NULL) {
1144 RF_Free(k_cfg, sizeof(RF_Config_t));
1145 retcode = ENOMEM;
1146 goto no_config;
1147 }
1148 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1149 k_cfg->layoutSpecificSize);
1150 if (retcode) {
1151 RF_Free(k_cfg, sizeof(RF_Config_t));
1152 RF_Free(specific_buf,
1153 k_cfg->layoutSpecificSize);
1154 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1155 retcode));
1156 goto no_config;
1157 }
1158 } else
1159 specific_buf = NULL;
1160 k_cfg->layoutSpecific = specific_buf;
1161
1162 /* should do some kind of sanity check on the configuration.
1163 * Store the sum of all the bytes in the last byte? */
1164
1165 /* configure the system */
1166
1167 /*
1168 * Clear the entire RAID descriptor, just to make sure
1169 * there is no stale data left in the case of a
1170 * reconfiguration
1171 */
1172 memset(raidPtr, 0, sizeof(*raidPtr));
1173 raidPtr->softc = rs;
1174 raidPtr->raidid = unit;
1175
1176 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1177
1178 if (retcode == 0) {
1179
1180 /* allow this many simultaneous IO's to
1181 this RAID device */
1182 raidPtr->openings = RAIDOUTSTANDING;
1183
1184 raidinit(rs);
1185 raid_wakeup(raidPtr);
1186 rf_markalldirty(raidPtr);
1187 }
1188 /* free the buffers. No return code here. */
1189 if (k_cfg->layoutSpecificSize) {
1190 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1191 }
1192 RF_Free(k_cfg, sizeof(RF_Config_t));
1193
1194 no_config:
1195 /*
1196 * If configuration failed, set sc_flags so that we
1197 * will detach the device when we close it.
1198 */
1199 if (retcode != 0)
1200 rs->sc_flags |= RAIDF_SHUTDOWN;
1201 return (retcode);
1202
1203 /* shutdown the system */
1204 case RAIDFRAME_SHUTDOWN:
1205
1206 part = DISKPART(dev);
1207 pmask = (1 << part);
1208
1209 if ((error = raidlock(rs)) != 0)
1210 return (error);
1211
1212 if (DK_BUSY(dksc, pmask) ||
1213 raidPtr->recon_in_progress != 0 ||
1214 raidPtr->parity_rewrite_in_progress != 0 ||
1215 raidPtr->copyback_in_progress != 0)
1216 retcode = EBUSY;
1217 else {
1218 /* detach and free on close */
1219 rs->sc_flags |= RAIDF_SHUTDOWN;
1220 retcode = 0;
1221 }
1222
1223 raidunlock(rs);
1224
1225 return (retcode);
1226 case RAIDFRAME_GET_COMPONENT_LABEL:
1227 clabel_ptr = (RF_ComponentLabel_t **) data;
1228 /* need to read the component label for the disk indicated
1229 by row,column in clabel */
1230
1231 /*
1232 * Perhaps there should be an option to skip the in-core
1233 * copy and hit the disk, as with disklabel(8).
1234 */
1235 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1236
1237 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1238
1239 if (retcode) {
1240 RF_Free(clabel, sizeof(*clabel));
1241 return retcode;
1242 }
1243
1244 clabel->row = 0; /* Don't allow looking at anything else.*/
1245
1246 column = clabel->column;
1247
1248 if ((column < 0) || (column >= raidPtr->numCol +
1249 raidPtr->numSpare)) {
1250 RF_Free(clabel, sizeof(*clabel));
1251 return EINVAL;
1252 }
1253
1254 RF_Free(clabel, sizeof(*clabel));
1255
1256 clabel = raidget_component_label(raidPtr, column);
1257
1258 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1259
1260 #if 0
1261 case RAIDFRAME_SET_COMPONENT_LABEL:
1262 clabel = (RF_ComponentLabel_t *) data;
1263
1264 /* XXX check the label for valid stuff... */
1265 /* Note that some things *should not* get modified --
1266 the user should be re-initing the labels instead of
1267 trying to patch things.
1268 */
1269
1270 raidid = raidPtr->raidid;
1271 #ifdef DEBUG
1272 printf("raid%d: Got component label:\n", raidid);
1273 printf("raid%d: Version: %d\n", raidid, clabel->version);
1274 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1275 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1276 printf("raid%d: Column: %d\n", raidid, clabel->column);
1277 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1278 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1279 printf("raid%d: Status: %d\n", raidid, clabel->status);
1280 #endif
1281 clabel->row = 0;
1282 column = clabel->column;
1283
1284 if ((column < 0) || (column >= raidPtr->numCol)) {
1285 return(EINVAL);
1286 }
1287
1288 /* XXX this isn't allowed to do anything for now :-) */
1289
1290 /* XXX and before it is, we need to fill in the rest
1291 of the fields!?!?!?! */
1292 memcpy(raidget_component_label(raidPtr, column),
1293 clabel, sizeof(*clabel));
1294 raidflush_component_label(raidPtr, column);
1295 return (0);
1296 #endif
1297
1298 case RAIDFRAME_INIT_LABELS:
1299 clabel = (RF_ComponentLabel_t *) data;
1300 /*
1301 we only want the serial number from
1302 the above. We get all the rest of the information
1303 from the config that was used to create this RAID
1304 set.
1305 */
1306
1307 raidPtr->serial_number = clabel->serial_number;
1308
1309 for(column=0;column<raidPtr->numCol;column++) {
1310 diskPtr = &raidPtr->Disks[column];
1311 if (!RF_DEAD_DISK(diskPtr->status)) {
1312 ci_label = raidget_component_label(raidPtr,
1313 column);
1314 /* Zeroing this is important. */
1315 memset(ci_label, 0, sizeof(*ci_label));
1316 raid_init_component_label(raidPtr, ci_label);
1317 ci_label->serial_number =
1318 raidPtr->serial_number;
1319 ci_label->row = 0; /* we dont' pretend to support more */
1320 rf_component_label_set_partitionsize(ci_label,
1321 diskPtr->partitionSize);
1322 ci_label->column = column;
1323 raidflush_component_label(raidPtr, column);
1324 }
1325 /* XXXjld what about the spares? */
1326 }
1327
1328 return (retcode);
1329 case RAIDFRAME_SET_AUTOCONFIG:
1330 d = rf_set_autoconfig(raidPtr, *(int *) data);
1331 printf("raid%d: New autoconfig value is: %d\n",
1332 raidPtr->raidid, d);
1333 *(int *) data = d;
1334 return (retcode);
1335
1336 case RAIDFRAME_SET_ROOT:
1337 d = rf_set_rootpartition(raidPtr, *(int *) data);
1338 printf("raid%d: New rootpartition value is: %d\n",
1339 raidPtr->raidid, d);
1340 *(int *) data = d;
1341 return (retcode);
1342
1343 /* initialize all parity */
1344 case RAIDFRAME_REWRITEPARITY:
1345
1346 if (raidPtr->Layout.map->faultsTolerated == 0) {
1347 /* Parity for RAID 0 is trivially correct */
1348 raidPtr->parity_good = RF_RAID_CLEAN;
1349 return(0);
1350 }
1351
1352 if (raidPtr->parity_rewrite_in_progress == 1) {
1353 /* Re-write is already in progress! */
1354 return(EINVAL);
1355 }
1356
1357 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1358 rf_RewriteParityThread,
1359 raidPtr,"raid_parity");
1360 return (retcode);
1361
1362
1363 case RAIDFRAME_ADD_HOT_SPARE:
1364 sparePtr = (RF_SingleComponent_t *) data;
1365 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1366 retcode = rf_add_hot_spare(raidPtr, &component);
1367 return(retcode);
1368
1369 case RAIDFRAME_REMOVE_HOT_SPARE:
1370 return(retcode);
1371
1372 case RAIDFRAME_DELETE_COMPONENT:
1373 componentPtr = (RF_SingleComponent_t *)data;
1374 memcpy( &component, componentPtr,
1375 sizeof(RF_SingleComponent_t));
1376 retcode = rf_delete_component(raidPtr, &component);
1377 return(retcode);
1378
1379 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1380 componentPtr = (RF_SingleComponent_t *)data;
1381 memcpy( &component, componentPtr,
1382 sizeof(RF_SingleComponent_t));
1383 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1384 return(retcode);
1385
1386 case RAIDFRAME_REBUILD_IN_PLACE:
1387
1388 if (raidPtr->Layout.map->faultsTolerated == 0) {
1389 /* Can't do this on a RAID 0!! */
1390 return(EINVAL);
1391 }
1392
1393 if (raidPtr->recon_in_progress == 1) {
1394 /* a reconstruct is already in progress! */
1395 return(EINVAL);
1396 }
1397
1398 componentPtr = (RF_SingleComponent_t *) data;
1399 memcpy( &component, componentPtr,
1400 sizeof(RF_SingleComponent_t));
1401 component.row = 0; /* we don't support any more */
1402 column = component.column;
1403
1404 if ((column < 0) || (column >= raidPtr->numCol)) {
1405 return(EINVAL);
1406 }
1407
1408 rf_lock_mutex2(raidPtr->mutex);
1409 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1410 (raidPtr->numFailures > 0)) {
1411 /* XXX 0 above shouldn't be constant!!! */
1412 /* some component other than this has failed.
1413 Let's not make things worse than they already
1414 are... */
1415 printf("raid%d: Unable to reconstruct to disk at:\n",
1416 raidPtr->raidid);
1417 printf("raid%d: Col: %d Too many failures.\n",
1418 raidPtr->raidid, column);
1419 rf_unlock_mutex2(raidPtr->mutex);
1420 return (EINVAL);
1421 }
1422 if (raidPtr->Disks[column].status ==
1423 rf_ds_reconstructing) {
1424 printf("raid%d: Unable to reconstruct to disk at:\n",
1425 raidPtr->raidid);
1426 printf("raid%d: Col: %d Reconstruction already occurring!\n", raidPtr->raidid, column);
1427
1428 rf_unlock_mutex2(raidPtr->mutex);
1429 return (EINVAL);
1430 }
1431 if (raidPtr->Disks[column].status == rf_ds_spared) {
1432 rf_unlock_mutex2(raidPtr->mutex);
1433 return (EINVAL);
1434 }
1435 rf_unlock_mutex2(raidPtr->mutex);
1436
1437 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1438 if (rrcopy == NULL)
1439 return(ENOMEM);
1440
1441 rrcopy->raidPtr = (void *) raidPtr;
1442 rrcopy->col = column;
1443
1444 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1445 rf_ReconstructInPlaceThread,
1446 rrcopy,"raid_reconip");
1447 return(retcode);
1448
1449 case RAIDFRAME_GET_INFO:
1450 if (!raidPtr->valid)
1451 return (ENODEV);
1452 ucfgp = (RF_DeviceConfig_t **) data;
1453 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1454 (RF_DeviceConfig_t *));
1455 if (d_cfg == NULL)
1456 return (ENOMEM);
1457 d_cfg->rows = 1; /* there is only 1 row now */
1458 d_cfg->cols = raidPtr->numCol;
1459 d_cfg->ndevs = raidPtr->numCol;
1460 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1461 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1462 return (ENOMEM);
1463 }
1464 d_cfg->nspares = raidPtr->numSpare;
1465 if (d_cfg->nspares >= RF_MAX_DISKS) {
1466 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1467 return (ENOMEM);
1468 }
1469 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1470 d = 0;
1471 for (j = 0; j < d_cfg->cols; j++) {
1472 d_cfg->devs[d] = raidPtr->Disks[j];
1473 d++;
1474 }
1475 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1476 d_cfg->spares[i] = raidPtr->Disks[j];
1477 if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
1478 /* XXX: raidctl(8) expects to see this as a used spare */
1479 d_cfg->spares[i].status = rf_ds_used_spare;
1480 }
1481 }
1482 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1483 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1484
1485 return (retcode);
1486
1487 case RAIDFRAME_CHECK_PARITY:
1488 *(int *) data = raidPtr->parity_good;
1489 return (0);
1490
1491 case RAIDFRAME_PARITYMAP_STATUS:
1492 if (rf_paritymap_ineligible(raidPtr))
1493 return EINVAL;
1494 rf_paritymap_status(raidPtr->parity_map,
1495 (struct rf_pmstat *)data);
1496 return 0;
1497
1498 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1499 if (rf_paritymap_ineligible(raidPtr))
1500 return EINVAL;
1501 if (raidPtr->parity_map == NULL)
1502 return ENOENT; /* ??? */
1503 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1504 (struct rf_pmparams *)data, 1))
1505 return EINVAL;
1506 return 0;
1507
1508 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1509 if (rf_paritymap_ineligible(raidPtr))
1510 return EINVAL;
1511 *(int *) data = rf_paritymap_get_disable(raidPtr);
1512 return 0;
1513
1514 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1515 if (rf_paritymap_ineligible(raidPtr))
1516 return EINVAL;
1517 rf_paritymap_set_disable(raidPtr, *(int *)data);
1518 /* XXX should errors be passed up? */
1519 return 0;
1520
1521 case RAIDFRAME_RESET_ACCTOTALS:
1522 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1523 return (0);
1524
1525 case RAIDFRAME_GET_ACCTOTALS:
1526 totals = (RF_AccTotals_t *) data;
1527 *totals = raidPtr->acc_totals;
1528 return (0);
1529
1530 case RAIDFRAME_KEEP_ACCTOTALS:
1531 raidPtr->keep_acc_totals = *(int *)data;
1532 return (0);
1533
1534 case RAIDFRAME_GET_SIZE:
1535 *(int *) data = raidPtr->totalSectors;
1536 return (0);
1537
1538 /* fail a disk & optionally start reconstruction */
1539 case RAIDFRAME_FAIL_DISK:
1540
1541 if (raidPtr->Layout.map->faultsTolerated == 0) {
1542 /* Can't do this on a RAID 0!! */
1543 return(EINVAL);
1544 }
1545
1546 rr = (struct rf_recon_req *) data;
1547 rr->row = 0;
1548 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1549 return (EINVAL);
1550
1551
1552 rf_lock_mutex2(raidPtr->mutex);
1553 if (raidPtr->status == rf_rs_reconstructing) {
1554 /* you can't fail a disk while we're reconstructing! */
1555 /* XXX wrong for RAID6 */
1556 rf_unlock_mutex2(raidPtr->mutex);
1557 return (EINVAL);
1558 }
1559 if ((raidPtr->Disks[rr->col].status ==
1560 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1561 /* some other component has failed. Let's not make
1562 things worse. XXX wrong for RAID6 */
1563 rf_unlock_mutex2(raidPtr->mutex);
1564 return (EINVAL);
1565 }
1566 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1567 /* Can't fail a spared disk! */
1568 rf_unlock_mutex2(raidPtr->mutex);
1569 return (EINVAL);
1570 }
1571 rf_unlock_mutex2(raidPtr->mutex);
1572
1573 /* make a copy of the recon request so that we don't rely on
1574 * the user's buffer */
1575 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1576 if (rrcopy == NULL)
1577 return(ENOMEM);
1578 memcpy(rrcopy, rr, sizeof(*rr));
1579 rrcopy->raidPtr = (void *) raidPtr;
1580
1581 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1582 rf_ReconThread,
1583 rrcopy,"raid_recon");
1584 return (0);
1585
1586 /* invoke a copyback operation after recon on whatever disk
1587 * needs it, if any */
1588 case RAIDFRAME_COPYBACK:
1589
1590 if (raidPtr->Layout.map->faultsTolerated == 0) {
1591 /* This makes no sense on a RAID 0!! */
1592 return(EINVAL);
1593 }
1594
1595 if (raidPtr->copyback_in_progress == 1) {
1596 /* Copyback is already in progress! */
1597 return(EINVAL);
1598 }
1599
1600 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1601 rf_CopybackThread,
1602 raidPtr,"raid_copyback");
1603 return (retcode);
1604
1605 /* return the percentage completion of reconstruction */
1606 case RAIDFRAME_CHECK_RECON_STATUS:
1607 if (raidPtr->Layout.map->faultsTolerated == 0) {
1608 /* This makes no sense on a RAID 0, so tell the
1609 user it's done. */
1610 *(int *) data = 100;
1611 return(0);
1612 }
1613 if (raidPtr->status != rf_rs_reconstructing)
1614 *(int *) data = 100;
1615 else {
1616 if (raidPtr->reconControl->numRUsTotal > 0) {
1617 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1618 } else {
1619 *(int *) data = 0;
1620 }
1621 }
1622 return (0);
1623 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1624 progressInfoPtr = (RF_ProgressInfo_t **) data;
1625 if (raidPtr->status != rf_rs_reconstructing) {
1626 progressInfo.remaining = 0;
1627 progressInfo.completed = 100;
1628 progressInfo.total = 100;
1629 } else {
1630 progressInfo.total =
1631 raidPtr->reconControl->numRUsTotal;
1632 progressInfo.completed =
1633 raidPtr->reconControl->numRUsComplete;
1634 progressInfo.remaining = progressInfo.total -
1635 progressInfo.completed;
1636 }
1637 retcode = copyout(&progressInfo, *progressInfoPtr,
1638 sizeof(RF_ProgressInfo_t));
1639 return (retcode);
1640
1641 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1642 if (raidPtr->Layout.map->faultsTolerated == 0) {
1643 /* This makes no sense on a RAID 0, so tell the
1644 user it's done. */
1645 *(int *) data = 100;
1646 return(0);
1647 }
1648 if (raidPtr->parity_rewrite_in_progress == 1) {
1649 *(int *) data = 100 *
1650 raidPtr->parity_rewrite_stripes_done /
1651 raidPtr->Layout.numStripe;
1652 } else {
1653 *(int *) data = 100;
1654 }
1655 return (0);
1656
1657 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1658 progressInfoPtr = (RF_ProgressInfo_t **) data;
1659 if (raidPtr->parity_rewrite_in_progress == 1) {
1660 progressInfo.total = raidPtr->Layout.numStripe;
1661 progressInfo.completed =
1662 raidPtr->parity_rewrite_stripes_done;
1663 progressInfo.remaining = progressInfo.total -
1664 progressInfo.completed;
1665 } else {
1666 progressInfo.remaining = 0;
1667 progressInfo.completed = 100;
1668 progressInfo.total = 100;
1669 }
1670 retcode = copyout(&progressInfo, *progressInfoPtr,
1671 sizeof(RF_ProgressInfo_t));
1672 return (retcode);
1673
1674 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1675 if (raidPtr->Layout.map->faultsTolerated == 0) {
1676 /* This makes no sense on a RAID 0 */
1677 *(int *) data = 100;
1678 return(0);
1679 }
1680 if (raidPtr->copyback_in_progress == 1) {
1681 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1682 raidPtr->Layout.numStripe;
1683 } else {
1684 *(int *) data = 100;
1685 }
1686 return (0);
1687
1688 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1689 progressInfoPtr = (RF_ProgressInfo_t **) data;
1690 if (raidPtr->copyback_in_progress == 1) {
1691 progressInfo.total = raidPtr->Layout.numStripe;
1692 progressInfo.completed =
1693 raidPtr->copyback_stripes_done;
1694 progressInfo.remaining = progressInfo.total -
1695 progressInfo.completed;
1696 } else {
1697 progressInfo.remaining = 0;
1698 progressInfo.completed = 100;
1699 progressInfo.total = 100;
1700 }
1701 retcode = copyout(&progressInfo, *progressInfoPtr,
1702 sizeof(RF_ProgressInfo_t));
1703 return (retcode);
1704
1705 case RAIDFRAME_SET_LAST_UNIT:
1706 for (column = 0; column < raidPtr->numCol; column++)
1707 if (raidPtr->Disks[column].status != rf_ds_optimal)
1708 return EBUSY;
1709
1710 for (column = 0; column < raidPtr->numCol; column++) {
1711 clabel = raidget_component_label(raidPtr, column);
1712 clabel->last_unit = *(int *)data;
1713 raidflush_component_label(raidPtr, column);
1714 }
1715 rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1716 return 0;
1717
1718 /* the sparetable daemon calls this to wait for the kernel to
1719 * need a spare table. this ioctl does not return until a
1720 * spare table is needed. XXX -- calling mpsleep here in the
1721 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1722 * -- I should either compute the spare table in the kernel,
1723 * or have a different -- XXX XXX -- interface (a different
1724 * character device) for delivering the table -- XXX */
1725 #if 0
1726 case RAIDFRAME_SPARET_WAIT:
1727 rf_lock_mutex2(rf_sparet_wait_mutex);
1728 while (!rf_sparet_wait_queue)
1729 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1730 waitreq = rf_sparet_wait_queue;
1731 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1732 rf_unlock_mutex2(rf_sparet_wait_mutex);
1733
1734 /* structure assignment */
1735 *((RF_SparetWait_t *) data) = *waitreq;
1736
1737 RF_Free(waitreq, sizeof(*waitreq));
1738 return (0);
1739
1740 /* wakes up a process waiting on SPARET_WAIT and puts an error
1741 * code in it that will cause the dameon to exit */
1742 case RAIDFRAME_ABORT_SPARET_WAIT:
1743 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1744 waitreq->fcol = -1;
1745 rf_lock_mutex2(rf_sparet_wait_mutex);
1746 waitreq->next = rf_sparet_wait_queue;
1747 rf_sparet_wait_queue = waitreq;
1748 rf_broadcast_conf2(rf_sparet_wait_cv);
1749 rf_unlock_mutex2(rf_sparet_wait_mutex);
1750 return (0);
1751
1752 /* used by the spare table daemon to deliver a spare table
1753 * into the kernel */
1754 case RAIDFRAME_SEND_SPARET:
1755
1756 /* install the spare table */
1757 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1758
1759 /* respond to the requestor. the return status of the spare
1760 * table installation is passed in the "fcol" field */
1761 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1762 waitreq->fcol = retcode;
1763 rf_lock_mutex2(rf_sparet_wait_mutex);
1764 waitreq->next = rf_sparet_resp_queue;
1765 rf_sparet_resp_queue = waitreq;
1766 rf_broadcast_cond2(rf_sparet_resp_cv);
1767 rf_unlock_mutex2(rf_sparet_wait_mutex);
1768
1769 return (retcode);
1770 #endif
1771
1772 default:
1773 break; /* fall through to the os-specific code below */
1774
1775 }
1776
1777 if (!raidPtr->valid)
1778 return (EINVAL);
1779
1780 /*
1781 * Add support for "regular" device ioctls here.
1782 */
1783
1784 switch (cmd) {
1785 case DIOCGCACHE:
1786 retcode = rf_get_component_caches(raidPtr, (int *)data);
1787 break;
1788
1789 case DIOCCACHESYNC:
1790 retcode = rf_sync_component_caches(raidPtr);
1791 break;
1792
1793 default:
1794 retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1795 break;
1796 }
1797
1798 return (retcode);
1799
1800 }
1801
1802
1803 /* raidinit -- complete the rest of the initialization for the
1804 RAIDframe device. */
1805
1806
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/*
	 * Attach the pseudo device: fabricate a cfdata record so that
	 * config_attach_pseudo() will create a device_t for this unit.
	 */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/*
		 * Attach failed: free the cfdata and bail out without
		 * setting RAIDF_INITED, so the unit stays unusable.
		 */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe. */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* first-come-first-served buffer queue, sorted by raw block number */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* discover wedges on the newly attached device */
	dkwedge_discover(&dksc->sc_dkdev);
}
1862
1863 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1864 /* wake up the daemon & tell it to get us a spare table
1865 * XXX
1866 * the entries in the queues should be tagged with the raidPtr
1867 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
1869 * XXX
1870 *
1871 * XXX This code is not currently used. GO
1872 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	/* Post the request on the wait queue and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* Block until the daemon posts a response; the loop guards
	 * against spurious wakeups. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the response entry; note that 'req' now points at the
	 * daemon's response, not the request queued above. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The spare-table installation status is returned in fcol. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1896 #endif
1897
1898 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1899 * bp & passes it down.
1900 * any calls originating in the kernel must use non-blocking I/O
1901 * do some extra sanity checking to return "appropriate" error values for
1902 * certain conditions (to make some standard utilities work)
1903 *
1904 * Formerly known as: rf_DoAccessKernel
1905 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/*
		 * A component failed since the last pass; refresh the
		 * component labels before issuing new I/O.  The mutex
		 * is dropped around the call — presumably because
		 * rf_update_component_labels() takes locks of its
		 * own; TODO confirm.
		 */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Refuse to start I/O on a unit that never finished raidinit(). */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* hand any queued buffers to the dk layer for dispatch */
	dk_start(dksc, NULL);
}
1932
1933 static int
1934 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
1935 {
1936 RF_SectorCount_t num_blocks, pb, sum;
1937 RF_RaidAddr_t raid_addr;
1938 daddr_t blocknum;
1939 int do_async;
1940 int rc;
1941
1942 rf_lock_mutex2(raidPtr->mutex);
1943 if (raidPtr->openings == 0) {
1944 rf_unlock_mutex2(raidPtr->mutex);
1945 return EAGAIN;
1946 }
1947 rf_unlock_mutex2(raidPtr->mutex);
1948
1949 blocknum = bp->b_rawblkno;
1950
1951 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1952 (int) blocknum));
1953
1954 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1955 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1956
1957 /* *THIS* is where we adjust what block we're going to...
1958 * but DO NOT TOUCH bp->b_blkno!!! */
1959 raid_addr = blocknum;
1960
1961 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1962 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1963 sum = raid_addr + num_blocks + pb;
1964 if (1 || rf_debugKernelAccess) {
1965 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1966 (int) raid_addr, (int) sum, (int) num_blocks,
1967 (int) pb, (int) bp->b_resid));
1968 }
1969 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1970 || (sum < num_blocks) || (sum < pb)) {
1971 rc = ENOSPC;
1972 goto done;
1973 }
1974 /*
1975 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1976 */
1977
1978 if (bp->b_bcount & raidPtr->sectorMask) {
1979 rc = ENOSPC;
1980 goto done;
1981 }
1982 db1_printf(("Calling DoAccess..\n"));
1983
1984
1985 rf_lock_mutex2(raidPtr->mutex);
1986 raidPtr->openings--;
1987 rf_unlock_mutex2(raidPtr->mutex);
1988
1989 /*
1990 * Everything is async.
1991 */
1992 do_async = 1;
1993
1994 /* don't ever condition on bp->b_flags & B_WRITE.
1995 * always condition on B_READ instead */
1996
1997 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
1998 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
1999 do_async, raid_addr, num_blocks,
2000 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2001
2002 done:
2003 return rc;
2004 }
2005
2006 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2007
2008 int
2009 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2010 {
2011 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2012 struct buf *bp;
2013
2014 req->queue = queue;
2015 bp = req->bp;
2016
2017 switch (req->type) {
2018 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
2019 /* XXX need to do something extra here.. */
2020 /* I'm leaving this in, as I've never actually seen it used,
2021 * and I'd like folks to report it... GO */
2022 printf(("WAKEUP CALLED\n"));
2023 queue->numOutstanding++;
2024
2025 bp->b_flags = 0;
2026 bp->b_private = req;
2027
2028 KernelWakeupFunc(bp);
2029 break;
2030
2031 case RF_IO_TYPE_READ:
2032 case RF_IO_TYPE_WRITE:
2033 #if RF_ACC_TRACE > 0
2034 if (req->tracerec) {
2035 RF_ETIMER_START(req->tracerec->timer);
2036 }
2037 #endif
2038 InitBP(bp, queue->rf_cinfo->ci_vp,
2039 op, queue->rf_cinfo->ci_dev,
2040 req->sectorOffset, req->numSector,
2041 req->buf, KernelWakeupFunc, (void *) req,
2042 queue->raidPtr->logBytesPerSector, req->b_proc);
2043
2044 if (rf_debugKernelAccess) {
2045 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2046 (long) bp->b_blkno));
2047 }
2048 queue->numOutstanding++;
2049 queue->last_deq_sector = req->sectorOffset;
2050 /* acc wouldn't have been let in if there were any pending
2051 * reqs at any other priority */
2052 queue->curPriority = req->priority;
2053
2054 db1_printf(("Going for %c to unit %d col %d\n",
2055 req->type, queue->raidPtr->raidid,
2056 queue->col));
2057 db1_printf(("sector %d count %d (%d bytes) %d\n",
2058 (int) req->sectorOffset, (int) req->numSector,
2059 (int) (req->numSector <<
2060 queue->raidPtr->logBytesPerSector),
2061 (int) queue->raidPtr->logBytesPerSector));
2062
2063 /*
2064 * XXX: drop lock here since this can block at
2065 * least with backing SCSI devices. Retake it
2066 * to minimize fuss with calling interfaces.
2067 */
2068
2069 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2070 bdev_strategy(bp);
2071 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2072 break;
2073
2074 default:
2075 panic("bad req->type in rf_DispatchKernelIO");
2076 }
2077 db1_printf(("Exiting from DispatchKernelIO\n"));
2078
2079 return (0);
2080 }
/* This is the callback function associated with an I/O invoked from
   kernel code.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private when the buf was set up */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* charge the elapsed time of this physical I/O to the trace record */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_optimal) ||
		    (queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_used_spare)) &&
		    (queue->raidPtr->numFailures <
		    queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			    queue->raidPtr->raidid,
			    bp->b_error,
			    queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2150
2151
2152 /*
2153 * initialize a buf structure for doing an I/O in the kernel.
2154 */
2155 static void
2156 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2157 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2158 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2159 struct proc *b_proc)
2160 {
2161 /* bp->b_flags = B_PHYS | rw_flag; */
2162 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2163 bp->b_oflags = 0;
2164 bp->b_cflags = 0;
2165 bp->b_bcount = numSect << logBytesPerSector;
2166 bp->b_bufsize = bp->b_bcount;
2167 bp->b_error = 0;
2168 bp->b_dev = dev;
2169 bp->b_data = bf;
2170 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2171 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2172 if (bp->b_bcount == 0) {
2173 panic("bp->b_bcount is zero in InitBP!!");
2174 }
2175 bp->b_proc = b_proc;
2176 bp->b_iodone = cbFunc;
2177 bp->b_private = cbArg;
2178 }
2179
2180 /*
2181 * Wait interruptibly for an exclusive lock.
2182 *
2183 * XXX
2184 * Several drivers do this; it should be abstracted and made MP-safe.
2185 * (Hmm... where have we seen this warning before :-> GO )
2186 */
2187 static int
2188 raidlock(struct raid_softc *rs)
2189 {
2190 int error;
2191
2192 error = 0;
2193 mutex_enter(&rs->sc_mutex);
2194 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2195 rs->sc_flags |= RAIDF_WANTED;
2196 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2197 if (error != 0)
2198 goto done;
2199 }
2200 rs->sc_flags |= RAIDF_LOCKED;
2201 done:
2202 mutex_exit(&rs->sc_mutex);
2203 return (error);
2204 }
2205 /*
2206 * Unlock and wake up any waiters.
2207 */
2208 static void
2209 raidunlock(struct raid_softc *rs)
2210 {
2211
2212 mutex_enter(&rs->sc_mutex);
2213 rs->sc_flags &= ~RAIDF_LOCKED;
2214 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2215 rs->sc_flags &= ~RAIDF_WANTED;
2216 cv_broadcast(&rs->sc_cv);
2217 }
2218 mutex_exit(&rs->sc_mutex);
2219 }
2220
2221
2222 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2223 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2224 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2225
2226 static daddr_t
2227 rf_component_info_offset(void)
2228 {
2229
2230 return RF_COMPONENT_INFO_OFFSET;
2231 }
2232
2233 static daddr_t
2234 rf_component_info_size(unsigned secsize)
2235 {
2236 daddr_t info_size;
2237
2238 KASSERT(secsize);
2239 if (secsize > RF_COMPONENT_INFO_SIZE)
2240 info_size = secsize;
2241 else
2242 info_size = RF_COMPONENT_INFO_SIZE;
2243
2244 return info_size;
2245 }
2246
2247 static daddr_t
2248 rf_parity_map_offset(RF_Raid_t *raidPtr)
2249 {
2250 daddr_t map_offset;
2251
2252 KASSERT(raidPtr->bytesPerSector);
2253 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2254 map_offset = raidPtr->bytesPerSector;
2255 else
2256 map_offset = RF_COMPONENT_INFO_SIZE;
2257 map_offset += rf_component_info_offset();
2258
2259 return map_offset;
2260 }
2261
2262 static daddr_t
2263 rf_parity_map_size(RF_Raid_t *raidPtr)
2264 {
2265 daddr_t map_size;
2266
2267 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2268 map_size = raidPtr->bytesPerSector;
2269 else
2270 map_size = RF_PARITY_MAP_SIZE;
2271
2272 return map_size;
2273 }
2274
2275 int
2276 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2277 {
2278 RF_ComponentLabel_t *clabel;
2279
2280 clabel = raidget_component_label(raidPtr, col);
2281 clabel->clean = RF_RAID_CLEAN;
2282 raidflush_component_label(raidPtr, col);
2283 return(0);
2284 }
2285
2286
2287 int
2288 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2289 {
2290 RF_ComponentLabel_t *clabel;
2291
2292 clabel = raidget_component_label(raidPtr, col);
2293 clabel->clean = RF_RAID_DIRTY;
2294 raidflush_component_label(raidPtr, col);
2295 return(0);
2296 }
2297
2298 int
2299 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2300 {
2301 KASSERT(raidPtr->bytesPerSector);
2302 return raidread_component_label(raidPtr->bytesPerSector,
2303 raidPtr->Disks[col].dev,
2304 raidPtr->raid_cinfo[col].ci_vp,
2305 &raidPtr->raid_cinfo[col].ci_label);
2306 }
2307
2308 RF_ComponentLabel_t *
2309 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2310 {
2311 return &raidPtr->raid_cinfo[col].ci_label;
2312 }
2313
2314 int
2315 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2316 {
2317 RF_ComponentLabel_t *label;
2318
2319 label = &raidPtr->raid_cinfo[col].ci_label;
2320 label->mod_counter = raidPtr->mod_counter;
2321 #ifndef RF_NO_PARITY_MAP
2322 label->parity_map_modcount = label->mod_counter;
2323 #endif
2324 return raidwrite_component_label(raidPtr->bytesPerSector,
2325 raidPtr->Disks[col].dev,
2326 raidPtr->raid_cinfo[col].ci_vp, label);
2327 }
2328
2329
2330 static int
2331 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2332 RF_ComponentLabel_t *clabel)
2333 {
2334 return raidread_component_area(dev, b_vp, clabel,
2335 sizeof(RF_ComponentLabel_t),
2336 rf_component_info_offset(),
2337 rf_component_info_size(secsize));
2338 }
2339
2340 /* ARGSUSED */
2341 static int
2342 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2343 size_t msize, daddr_t offset, daddr_t dsize)
2344 {
2345 struct buf *bp;
2346 int error;
2347
2348 /* XXX should probably ensure that we don't try to do this if
2349 someone has changed rf_protected_sectors. */
2350
2351 if (b_vp == NULL) {
2352 /* For whatever reason, this component is not valid.
2353 Don't try to read a component label from it. */
2354 return(EINVAL);
2355 }
2356
2357 /* get a block of the appropriate size... */
2358 bp = geteblk((int)dsize);
2359 bp->b_dev = dev;
2360
2361 /* get our ducks in a row for the read */
2362 bp->b_blkno = offset / DEV_BSIZE;
2363 bp->b_bcount = dsize;
2364 bp->b_flags |= B_READ;
2365 bp->b_resid = dsize;
2366
2367 bdev_strategy(bp);
2368 error = biowait(bp);
2369
2370 if (!error) {
2371 memcpy(data, bp->b_data, msize);
2372 }
2373
2374 brelse(bp, 0);
2375 return(error);
2376 }
2377
2378
2379 static int
2380 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2381 RF_ComponentLabel_t *clabel)
2382 {
2383 return raidwrite_component_area(dev, b_vp, clabel,
2384 sizeof(RF_ComponentLabel_t),
2385 rf_component_info_offset(),
2386 rf_component_info_size(secsize), 0);
2387 }
2388
2389 /* ARGSUSED */
2390 static int
2391 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2392 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2393 {
2394 struct buf *bp;
2395 int error;
2396
2397 /* get a block of the appropriate size... */
2398 bp = geteblk((int)dsize);
2399 bp->b_dev = dev;
2400
2401 /* get our ducks in a row for the write */
2402 bp->b_blkno = offset / DEV_BSIZE;
2403 bp->b_bcount = dsize;
2404 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2405 bp->b_resid = dsize;
2406
2407 memset(bp->b_data, 0, dsize);
2408 memcpy(bp->b_data, data, msize);
2409
2410 bdev_strategy(bp);
2411 if (asyncp)
2412 return 0;
2413 error = biowait(bp);
2414 brelse(bp, 0);
2415 if (error) {
2416 #if 1
2417 printf("Failed to write RAID component info!\n");
2418 #endif
2419 }
2420
2421 return(error);
2422 }
2423
2424 void
2425 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2426 {
2427 int c;
2428
2429 for (c = 0; c < raidPtr->numCol; c++) {
2430 /* Skip dead disks. */
2431 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2432 continue;
2433 /* XXXjld: what if an error occurs here? */
2434 raidwrite_component_area(raidPtr->Disks[c].dev,
2435 raidPtr->raid_cinfo[c].ci_vp, map,
2436 RF_PARITYMAP_NBYTE,
2437 rf_parity_map_offset(raidPtr),
2438 rf_parity_map_size(raidPtr), 0);
2439 }
2440 }
2441
2442 void
2443 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2444 {
2445 struct rf_paritymap_ondisk tmp;
2446 int c,first;
2447
2448 first=1;
2449 for (c = 0; c < raidPtr->numCol; c++) {
2450 /* Skip dead disks. */
2451 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2452 continue;
2453 raidread_component_area(raidPtr->Disks[c].dev,
2454 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2455 RF_PARITYMAP_NBYTE,
2456 rf_parity_map_offset(raidPtr),
2457 rf_parity_map_size(raidPtr));
2458 if (first) {
2459 memcpy(map, &tmp, sizeof(*map));
2460 first = 0;
2461 } else {
2462 rf_paritymap_merge(map, &tmp);
2463 }
2464 }
2465 }
2466
/*
 * rf_markalldirty -- bump the set's modification counter and mark the
 * component label of every live component (and every in-use spare)
 * dirty on disk, so that an unclean shutdown can be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			/* Reset the label to current defaults, then record
			   the column the spare is substituting for. */
			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2526
2527
/*
 * rf_update_component_labels -- refresh the on-disk component label of
 * every optimal component and every in-use spare: bump the mod counter,
 * record current status and unit number, and flush each label.  When
 * `final' is RF_FINAL_COMPONENT_UPDATE and parity is known good, the
 * labels are additionally marked clean (a second flush per component
 * via raidmarkclean()).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2605
2606 void
2607 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2608 {
2609
2610 if (vp != NULL) {
2611 if (auto_configured == 1) {
2612 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2613 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2614 vput(vp);
2615
2616 } else {
2617 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2618 }
2619 }
2620 }
2621
2622
2623 void
2624 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2625 {
2626 int r,c;
2627 struct vnode *vp;
2628 int acd;
2629
2630
2631 /* We take this opportunity to close the vnodes like we should.. */
2632
2633 for (c = 0; c < raidPtr->numCol; c++) {
2634 vp = raidPtr->raid_cinfo[c].ci_vp;
2635 acd = raidPtr->Disks[c].auto_configured;
2636 rf_close_component(raidPtr, vp, acd);
2637 raidPtr->raid_cinfo[c].ci_vp = NULL;
2638 raidPtr->Disks[c].auto_configured = 0;
2639 }
2640
2641 for (r = 0; r < raidPtr->numSpare; r++) {
2642 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2643 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2644 rf_close_component(raidPtr, vp, acd);
2645 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2646 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2647 }
2648 }
2649
2650
2651 void
2652 rf_ReconThread(struct rf_recon_req *req)
2653 {
2654 int s;
2655 RF_Raid_t *raidPtr;
2656
2657 s = splbio();
2658 raidPtr = (RF_Raid_t *) req->raidPtr;
2659 raidPtr->recon_in_progress = 1;
2660
2661 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2662 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2663
2664 RF_Free(req, sizeof(*req));
2665
2666 raidPtr->recon_in_progress = 0;
2667 splx(s);
2668
2669 /* That's all... */
2670 kthread_exit(0); /* does not return */
2671 }
2672
2673 void
2674 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2675 {
2676 int retcode;
2677 int s;
2678
2679 raidPtr->parity_rewrite_stripes_done = 0;
2680 raidPtr->parity_rewrite_in_progress = 1;
2681 s = splbio();
2682 retcode = rf_RewriteParity(raidPtr);
2683 splx(s);
2684 if (retcode) {
2685 printf("raid%d: Error re-writing parity (%d)!\n",
2686 raidPtr->raidid, retcode);
2687 } else {
2688 /* set the clean bit! If we shutdown correctly,
2689 the clean bit on each component label will get
2690 set */
2691 raidPtr->parity_good = RF_RAID_CLEAN;
2692 }
2693 raidPtr->parity_rewrite_in_progress = 0;
2694
2695 /* Anyone waiting for us to stop? If so, inform them... */
2696 if (raidPtr->waitShutdown) {
2697 wakeup(&raidPtr->parity_rewrite_in_progress);
2698 }
2699
2700 /* That's all... */
2701 kthread_exit(0); /* does not return */
2702 }
2703
2704
2705 void
2706 rf_CopybackThread(RF_Raid_t *raidPtr)
2707 {
2708 int s;
2709
2710 raidPtr->copyback_in_progress = 1;
2711 s = splbio();
2712 rf_CopybackReconstructedData(raidPtr);
2713 splx(s);
2714 raidPtr->copyback_in_progress = 0;
2715
2716 /* That's all... */
2717 kthread_exit(0); /* does not return */
2718 }
2719
2720
2721 void
2722 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2723 {
2724 int s;
2725 RF_Raid_t *raidPtr;
2726
2727 s = splbio();
2728 raidPtr = req->raidPtr;
2729 raidPtr->recon_in_progress = 1;
2730 rf_ReconstructInPlace(raidPtr, req->col);
2731 RF_Free(req, sizeof(*req));
2732 raidPtr->recon_in_progress = 0;
2733 splx(s);
2734
2735 /* That's all... */
2736 kthread_exit(0); /* does not return */
2737 }
2738
/*
 * rf_get_component -- try to read a component label from the open
 * device (dev/vp).  If a reasonable label is found, prepend a new
 * RF_AutoConfig_t entry to `ac_list' (taking ownership of vp);
 * otherwise the vnode is closed and released here.  Returns the
 * (possibly extended) list, or NULL after freeing the entire list if
 * memory runs out.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* Out of memory: tear down everything collected so far
		   (also reached from the allocation failure below). */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL;		/* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			/* Prepend to the list. */
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: not a RAID component, so release the label
		   buffer and close/release the vnode we were handed. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2796
/*
 * rf_find_raid_components -- scan every disk-class device in the
 * system for RAIDframe component labels and return a list of all
 * plausible components found (for autoconfiguration).
 *
 * The scan is made twice: first over wedges (dk), then over everything
 * else, so that a wedge covering a whole disk wins over that disk's
 * raw partition.  For non-wedge disks, each FS_RAID-typed disklabel
 * partition is probed; if none is found, the raw partition itself is
 * probed as a last resort.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		    dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedges: probe the wedge directly if it
				   carries the RAIDframe partition type. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes ownership of vp. */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
3000
3001
3002 int
3003 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3004 {
3005
3006 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3007 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3008 ((clabel->clean == RF_RAID_CLEAN) ||
3009 (clabel->clean == RF_RAID_DIRTY)) &&
3010 clabel->row >=0 &&
3011 clabel->column >= 0 &&
3012 clabel->num_rows > 0 &&
3013 clabel->num_columns > 0 &&
3014 clabel->row < clabel->num_rows &&
3015 clabel->column < clabel->num_columns &&
3016 clabel->blockSize > 0 &&
3017 /*
3018 * numBlocksHi may contain garbage, but it is ok since
3019 * the type is unsigned. If it is really garbage,
3020 * rf_fix_old_label_size() will fix it.
3021 */
3022 rf_component_label_numblocks(clabel) > 0) {
3023 /*
3024 * label looks reasonable enough...
3025 * let's make sure it has no old garbage.
3026 */
3027 if (numsecs)
3028 rf_fix_old_label_size(clabel, numsecs);
3029 return(1);
3030 }
3031 return(0);
3032 }
3033
3034
3035 /*
3036 * For reasons yet unknown, some old component labels have garbage in
3037 * the newer numBlocksHi region, and this causes lossage. Since those
3038 * disks will also have numsecs set to less than 32 bits of sectors,
3039 * we can determine when this corruption has occurred, and fix it.
3040 *
3041 * The exact same problem, with the same unknown reason, happens to
3042 * the partitionSizeHi member as well.
3043 */
3044 static void
3045 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3046 {
3047
3048 if (numsecs < ((uint64_t)1 << 32)) {
3049 if (clabel->numBlocksHi) {
3050 printf("WARNING: total sectors < 32 bits, yet "
3051 "numBlocksHi set\n"
3052 "WARNING: resetting numBlocksHi to zero.\n");
3053 clabel->numBlocksHi = 0;
3054 }
3055
3056 if (clabel->partitionSizeHi) {
3057 printf("WARNING: total sectors < 32 bits, yet "
3058 "partitionSizeHi set\n"
3059 "WARNING: resetting partitionSizeHi to zero.\n");
3060 clabel->partitionSizeHi = 0;
3061 }
3062 }
3063 }
3064
3065
3066 #ifdef DEBUG
/*
 * rf_print_component_label -- dump a component label to the console in
 * human-readable form (DEBUG kernels only).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Decode of the 2-bit root_partition field. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
3098 #endif
3099
3100 RF_ConfigSet_t *
3101 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3102 {
3103 RF_AutoConfig_t *ac;
3104 RF_ConfigSet_t *config_sets;
3105 RF_ConfigSet_t *cset;
3106 RF_AutoConfig_t *ac_next;
3107
3108
3109 config_sets = NULL;
3110
3111 /* Go through the AutoConfig list, and figure out which components
3112 belong to what sets. */
3113 ac = ac_list;
3114 while(ac!=NULL) {
3115 /* we're going to putz with ac->next, so save it here
3116 for use at the end of the loop */
3117 ac_next = ac->next;
3118
3119 if (config_sets == NULL) {
3120 /* will need at least this one... */
3121 config_sets = (RF_ConfigSet_t *)
3122 malloc(sizeof(RF_ConfigSet_t),
3123 M_RAIDFRAME, M_NOWAIT);
3124 if (config_sets == NULL) {
3125 panic("rf_create_auto_sets: No memory!");
3126 }
3127 /* this one is easy :) */
3128 config_sets->ac = ac;
3129 config_sets->next = NULL;
3130 config_sets->rootable = 0;
3131 ac->next = NULL;
3132 } else {
3133 /* which set does this component fit into? */
3134 cset = config_sets;
3135 while(cset!=NULL) {
3136 if (rf_does_it_fit(cset, ac)) {
3137 /* looks like it matches... */
3138 ac->next = cset->ac;
3139 cset->ac = ac;
3140 break;
3141 }
3142 cset = cset->next;
3143 }
3144 if (cset==NULL) {
3145 /* didn't find a match above... new set..*/
3146 cset = (RF_ConfigSet_t *)
3147 malloc(sizeof(RF_ConfigSet_t),
3148 M_RAIDFRAME, M_NOWAIT);
3149 if (cset == NULL) {
3150 panic("rf_create_auto_sets: No memory!");
3151 }
3152 cset->ac = ac;
3153 ac->next = NULL;
3154 cset->next = config_sets;
3155 cset->rootable = 0;
3156 config_sets = cset;
3157 }
3158 }
3159 ac = ac_next;
3160 }
3161
3162
3163 return(config_sets);
3164 }
3165
3166 static int
3167 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3168 {
3169 RF_ComponentLabel_t *clabel1, *clabel2;
3170
3171 /* If this one matches the *first* one in the set, that's good
3172 enough, since the other members of the set would have been
3173 through here too... */
3174 /* note that we are not checking partitionSize here..
3175
3176 Note that we are also not checking the mod_counters here.
3177 If everything else matches except the mod_counter, that's
3178 good enough for this test. We will deal with the mod_counters
3179 a little later in the autoconfiguration process.
3180
3181 (clabel1->mod_counter == clabel2->mod_counter) &&
3182
3183 The reason we don't check for this is that failed disks
3184 will have lower modification counts. If those disks are
3185 not added to the set they used to belong to, then they will
3186 form their own set, which may result in 2 different sets,
3187 for example, competing to be configured at raid0, and
3188 perhaps competing to be the root filesystem set. If the
3189 wrong ones get configured, or both attempt to become /,
3190 weird behaviour and or serious lossage will occur. Thus we
3191 need to bring them into the fold here, and kick them out at
3192 a later point.
3193
3194 */
3195
3196 clabel1 = cset->ac->clabel;
3197 clabel2 = ac->clabel;
3198 if ((clabel1->version == clabel2->version) &&
3199 (clabel1->serial_number == clabel2->serial_number) &&
3200 (clabel1->num_rows == clabel2->num_rows) &&
3201 (clabel1->num_columns == clabel2->num_columns) &&
3202 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3203 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3204 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3205 (clabel1->parityConfig == clabel2->parityConfig) &&
3206 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3207 (clabel1->blockSize == clabel2->blockSize) &&
3208 rf_component_label_numblocks(clabel1) ==
3209 rf_component_label_numblocks(clabel2) &&
3210 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3211 (clabel1->root_partition == clabel2->root_partition) &&
3212 (clabel1->last_unit == clabel2->last_unit) &&
3213 (clabel1->config_order == clabel2->config_order)) {
3214 /* if it get's here, it almost *has* to be a match */
3215 } else {
3216 /* it's not consistent with somebody in the set..
3217 punt */
3218 return(0);
3219 }
3220 /* all was fine.. it must fit... */
3221 return(1);
3222 }
3223
/*
 * rf_have_enough_components -- decide whether config set `cset' has
 * enough live components (those carrying the set's highest
 * mod_counter) to be configured.  Returns 1 if so, 0 if too many
 * components are missing for the set's RAID level.  RAID 1 is handled
 * specially: the set survives as long as no even/odd component *pair*
 * is entirely missing.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The authoritative mod_counter is the maximum over all members;
	   components with a lower count are stale (failed) and don't
	   count as present below. */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* Look for a current (mod_counter matches) component
		   claiming column c. */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3326
3327 void
3328 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3329 RF_Raid_t *raidPtr)
3330 {
3331 RF_ComponentLabel_t *clabel;
3332 int i;
3333
3334 clabel = ac->clabel;
3335
3336 /* 1. Fill in the common stuff */
3337 config->numRow = clabel->num_rows = 1;
3338 config->numCol = clabel->num_columns;
3339 config->numSpare = 0; /* XXX should this be set here? */
3340 config->sectPerSU = clabel->sectPerSU;
3341 config->SUsPerPU = clabel->SUsPerPU;
3342 config->SUsPerRU = clabel->SUsPerRU;
3343 config->parityConfig = clabel->parityConfig;
3344 /* XXX... */
3345 strcpy(config->diskQueueType,"fifo");
3346 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3347 config->layoutSpecificSize = 0; /* XXX ?? */
3348
3349 while(ac!=NULL) {
3350 /* row/col values will be in range due to the checks
3351 in reasonable_label() */
3352 strcpy(config->devnames[0][ac->clabel->column],
3353 ac->devname);
3354 ac = ac->next;
3355 }
3356
3357 for(i=0;i<RF_MAXDBGV;i++) {
3358 config->debugVars[i][0] = 0;
3359 }
3360 }
3361
3362 int
3363 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3364 {
3365 RF_ComponentLabel_t *clabel;
3366 int column;
3367 int sparecol;
3368
3369 raidPtr->autoconfigure = new_value;
3370
3371 for(column=0; column<raidPtr->numCol; column++) {
3372 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3373 clabel = raidget_component_label(raidPtr, column);
3374 clabel->autoconfigure = new_value;
3375 raidflush_component_label(raidPtr, column);
3376 }
3377 }
3378 for(column = 0; column < raidPtr->numSpare ; column++) {
3379 sparecol = raidPtr->numCol + column;
3380 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3381 clabel = raidget_component_label(raidPtr, sparecol);
3382 clabel->autoconfigure = new_value;
3383 raidflush_component_label(raidPtr, sparecol);
3384 }
3385 }
3386 return(new_value);
3387 }
3388
3389 int
3390 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3391 {
3392 RF_ComponentLabel_t *clabel;
3393 int column;
3394 int sparecol;
3395
3396 raidPtr->root_partition = new_value;
3397 for(column=0; column<raidPtr->numCol; column++) {
3398 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3399 clabel = raidget_component_label(raidPtr, column);
3400 clabel->root_partition = new_value;
3401 raidflush_component_label(raidPtr, column);
3402 }
3403 }
3404 for(column = 0; column < raidPtr->numSpare ; column++) {
3405 sparecol = raidPtr->numCol + column;
3406 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3407 clabel = raidget_component_label(raidPtr, sparecol);
3408 clabel->root_partition = new_value;
3409 raidflush_component_label(raidPtr, sparecol);
3410 }
3411 }
3412 return(new_value);
3413 }
3414
3415 void
3416 rf_release_all_vps(RF_ConfigSet_t *cset)
3417 {
3418 RF_AutoConfig_t *ac;
3419
3420 ac = cset->ac;
3421 while(ac!=NULL) {
3422 /* Close the vp, and give it back */
3423 if (ac->vp) {
3424 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3425 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3426 vput(ac->vp);
3427 ac->vp = NULL;
3428 }
3429 ac = ac->next;
3430 }
3431 }
3432
3433
3434 void
3435 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3436 {
3437 RF_AutoConfig_t *ac;
3438 RF_AutoConfig_t *next_ac;
3439
3440 ac = cset->ac;
3441 while(ac!=NULL) {
3442 next_ac = ac->next;
3443 /* nuke the label */
3444 free(ac->clabel, M_RAIDFRAME);
3445 /* cleanup the config structure */
3446 free(ac, M_RAIDFRAME);
3447 /* "next.." */
3448 ac = next_ac;
3449 }
3450 /* and, finally, nuke the config set */
3451 free(cset, M_RAIDFRAME);
3452 }
3453
3454
/*
 * Fill in a component label from the current in-core state of the
 * RAID set.  Used whenever fresh labels are written out.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* geometry, from the in-core layout */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3487
/*
 * Configure a RAID set from an auto-config component set.  Picks a
 * unit number (preferring the one the set was last configured at),
 * builds an RF_Config_t from the labels, and runs rf_Configure().
 * Returns the configured softc, or NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from last_unit until we find a unit that either
	   doesn't exist yet (sc == NULL) or exists but is unconfigured */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);	/* create the unit */
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3571
/*
 * Initialize a pool for RAIDframe structures: prime it with xmin
 * pre-allocated items and set the low/high watermarks to xmin/xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3581
3582 /*
3583 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3584 * to see if there is IO pending and if that IO could possibly be done
3585 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3586 * otherwise.
3587 *
3588 */
3589 int
3590 rf_buf_queue_check(RF_Raid_t *raidPtr)
3591 {
3592 struct raid_softc *rs;
3593 struct dk_softc *dksc;
3594
3595 rs = raidPtr->softc;
3596 dksc = &rs->sc_dksc;
3597
3598 if ((rs->sc_flags & RAIDF_INITED) == 0)
3599 return 1;
3600
3601 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3602 /* there is work to do */
3603 return 0;
3604 }
3605 /* default is nothing to do */
3606 return 1;
3607 }
3608
3609 int
3610 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3611 {
3612 uint64_t numsecs;
3613 unsigned secsize;
3614 int error;
3615
3616 error = getdisksize(vp, &numsecs, &secsize);
3617 if (error == 0) {
3618 diskPtr->blockSize = secsize;
3619 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3620 diskPtr->partitionSize = numsecs;
3621 return 0;
3622 }
3623 return error;
3624 }
3625
/*
 * Autoconfiguration match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3631
/*
 * Autoconfiguration attach function: intentionally empty; the real
 * per-unit setup is performed when the set is configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3636
3637
3638 static int
3639 raid_detach(device_t self, int flags)
3640 {
3641 int error;
3642 struct raid_softc *rs = raidsoftc(self);
3643
3644 if (rs == NULL)
3645 return ENXIO;
3646
3647 if ((error = raidlock(rs)) != 0)
3648 return (error);
3649
3650 error = raid_detach_unlocked(rs);
3651
3652 raidunlock(rs);
3653
3654 /* XXX raid can be referenced here */
3655
3656 if (error)
3657 return error;
3658
3659 /* Free the softc */
3660 raidput(rs);
3661
3662 return 0;
3663 }
3664
/*
 * Publish the geometry of the RAID set to the disk(9) layer.  The
 * sectors/tracks split is synthetic (computed from the array layout,
 * not queried from hardware).
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;	/* synthetic */

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
3680
3681 /*
3682 * Get cache info for all the components (including spares).
3683 * Returns intersection of all the cache flags of all disks, or first
3684 * error if any encountered.
3685 * XXXfua feature flags can change as spares are added - lock down somehow
3686 */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				return error;
			}

			/*
			 * NOTE(review): if component 0 is dead, dkwhole
			 * stays 0 here and the first combine below may
			 * mask off all flags from later components —
			 * verify against DKCACHE_COMBINE semantics.
			 */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}
3724
3725 /*
3726 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3727 * We end up returning whatever error was returned by the first cache flush
3728 * that fails.
3729 */
3730
3731 int
3732 rf_sync_component_caches(RF_Raid_t *raidPtr)
3733 {
3734 int c, sparecol;
3735 int e,error;
3736 int force = 1;
3737
3738 error = 0;
3739 for (c = 0; c < raidPtr->numCol; c++) {
3740 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3741 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3742 &force, FWRITE, NOCRED);
3743 if (e) {
3744 if (e != ENODEV)
3745 printf("raid%d: cache flush to component %s failed.\n",
3746 raidPtr->raidid, raidPtr->Disks[c].devname);
3747 if (error == 0) {
3748 error = e;
3749 }
3750 }
3751 }
3752 }
3753
3754 for( c = 0; c < raidPtr->numSpare ; c++) {
3755 sparecol = raidPtr->numCol + c;
3756 /* Need to ensure that the reconstruct actually completed! */
3757 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3758 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3759 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3760 if (e) {
3761 if (e != ENODEV)
3762 printf("raid%d: cache flush to component %s failed.\n",
3763 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3764 if (error == 0) {
3765 error = e;
3766 }
3767 }
3768 }
3769 }
3770 return error;
3771 }
3772
3773 /*
3774 * Module interface
3775 */
3776
3777 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
3778
3779 #ifdef _MODULE
3780 CFDRIVER_DECL(raid, DV_DISK, NULL);
3781 #endif
3782
3783 static int raid_modcmd(modcmd_t, void *);
3784 static int raid_modcmd_init(void);
3785 static int raid_modcmd_fini(void);
3786
3787 static int
3788 raid_modcmd(modcmd_t cmd, void *data)
3789 {
3790 int error;
3791
3792 error = 0;
3793 switch (cmd) {
3794 case MODULE_CMD_INIT:
3795 error = raid_modcmd_init();
3796 break;
3797 case MODULE_CMD_FINI:
3798 error = raid_modcmd_fini();
3799 break;
3800 default:
3801 error = ENOTTY;
3802 break;
3803 }
3804 return error;
3805 }
3806
/*
 * Module initialization: attach the devsw, the cfdriver (when built
 * as a module) and the cfattach, boot RAIDframe, and register a
 * finalizer that runs autoconfiguration once all hardware is found.
 * Each failure path unwinds the registrations made before it.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the majors; EEXIST (already
	   attached, e.g. built-in) is tolerated */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* NOTE(review): error is always 0 at this point (all non-zero
	   paths returned above), so this test looks redundant */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* non-fatal: continue without autoconfiguration */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
3877
/*
 * Module teardown: refuse to unload while any raid unit exists, then
 * detach the cfattach, cfdriver and devsw in reverse order of
 * raid_modcmd_init(), re-attaching on partial failure so the module
 * is left in a consistent state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist. */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* roll back: re-attach what we already detached */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* roll back: re-attach what we already detached;
		   NOTE(review): the rollback return values are ignored */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
3927