/*	$NetBSD: rf_netbsdkintf.c,v 1.339 2016/01/05 17:06:34 mlelstv Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Greg Oster; Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *
 *      @(#)cd.c        8.2 (Berkeley) 11/16/93
 */

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Mark Holland, Jim Zelenka
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/***********************************************************
 *
 * rf_kintf.c -- the kernel interface routines for RAIDframe
 *
 ***********************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.339 2016/01/05 17:06:34 mlelstv Exp $");

#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_raid_autoconfig.h"
#endif

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/disk.h>
#include <sys/device.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/reboot.h>
#include <sys/kauth.h>
#include <sys/module.h>

#include <prop/proplib.h>

#include <dev/raidframe/raidframevar.h>
#include <dev/raidframe/raidframeio.h>
#include <dev/raidframe/rf_paritymap.h>

#include "rf_raid.h"
#include "rf_copyback.h"
#include "rf_dag.h"
#include "rf_dagflags.h"
#include "rf_desc.h"
#include "rf_diskqueue.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_kintf.h"
#include "rf_options.h"
#include "rf_driver.h"
#include "rf_parityscan.h"
#include "rf_threadstuff.h"

#ifdef COMPAT_50
#include "rf_compat50.h"
#endif

#include "ioconf.h"

#ifdef DEBUG
int rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else /* DEBUG */
#define db1_printf(a) { }
#endif /* DEBUG */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */
#endif

MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");

/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int, struct proc *);
struct raid_softc;
static void raidinit(struct raid_softc *);
static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);

static int raid_match(device_t, cfdata_t, void *);
static void raid_attach(device_t, device_t, void *);
static int raid_detach(device_t, int);

static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

static int raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);

static int raid_diskstart(device_t, struct buf *bp);
static int raid_dumpblocks(device_t, void *, daddr_t, int);
static int raid_lastclose(device_t);

static dev_type_open(raidopen);
static dev_type_close(raidclose);
static dev_type_read(raidread);
static dev_type_write(raidwrite);
static dev_type_ioctl(raidioctl);
static dev_type_strategy(raidstrategy);
static dev_type_dump(raiddump);
static dev_type_size(raidsize);

const struct bdevsw raid_bdevsw = {
    .d_open = raidopen,
    .d_close = raidclose,
    .d_strategy = raidstrategy,
    .d_ioctl = raidioctl,
    .d_dump = raiddump,
    .d_psize = raidsize,
    .d_discard = nodiscard,
    .d_flag = D_DISK
};

const struct cdevsw raid_cdevsw = {
    .d_open = raidopen,
    .d_close = raidclose,
    .d_read = raidread,
    .d_write = raidwrite,
    .d_ioctl = raidioctl,
    .d_stop = nostop,
    .d_tty = notty,
    .d_poll = nopoll,
    .d_mmap = nommap,
    .d_kqfilter = nokqfilter,
    .d_discard = nodiscard,
    .d_flag = D_DISK
};

static struct dkdriver rf_dkdriver = {
    .d_open = raidopen,
    .d_close = raidclose,
    .d_strategy = raidstrategy,
    .d_diskstart = raid_diskstart,
    .d_dumpblocks = raid_dumpblocks,
    .d_lastclose = raid_lastclose,
    .d_minphys = minphys
};

struct raid_softc {
    struct dk_softc sc_dksc;
    int sc_unit;
    int sc_flags;		/* flags */
    int sc_cflags;		/* configuration flags */
    kmutex_t sc_mutex;		/* interlock mutex */
    kcondvar_t sc_cv;		/* and the condvar */
    uint64_t sc_size;		/* size of the raid device */
    char sc_xname[20];		/* XXX external name */
    RF_Raid_t sc_r;
    LIST_ENTRY(raid_softc) sc_link;
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
#define RAIDF_DETACH	0x10	/* detach after final close */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */

#define raidunit(x)	DISKUNIT(x)
#define raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)

extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by, say, 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif

#define RAIDLABELDEV(dev)	\
    (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

void rf_ReconThread(struct rf_recon_req *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req *);
int rf_autoconfig(device_t);
void rf_buildroothack(RF_ConfigSet_t *);

RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *, RF_AutoConfig_t *);
int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
void rf_create_configuration(RF_AutoConfig_t *, RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
static bool raidautoconfigdone = false;

struct RF_Pools_s rf_pools;

static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;
static struct raid_softc *
raidcreate(int unit) {
    struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    if (sc == NULL) {
#ifdef DIAGNOSTIC
        printf("%s: out of memory\n", __func__);
#endif
        return NULL;
    }
    sc->sc_unit = unit;
    cv_init(&sc->sc_cv, "raidunit");
    mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    return sc;
}

static void
raiddestroy(struct raid_softc *sc) {
    cv_destroy(&sc->sc_cv);
    mutex_destroy(&sc->sc_mutex);
    kmem_free(sc, sizeof(*sc));
}

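/*
 * Look up the softc for a unit on the global list, optionally
 * creating and enqueueing a new one if it does not exist yet.
 */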
static struct raid_softc *
raidget(int unit, bool create) {
    struct raid_softc *sc;
    if (unit < 0) {
#ifdef DIAGNOSTIC
        panic("%s: unit %d!", __func__, unit);
#endif
        return NULL;
    }
    mutex_enter(&raid_lock);
    LIST_FOREACH(sc, &raids, sc_link) {
        if (sc->sc_unit == unit) {
            mutex_exit(&raid_lock);
            return sc;
        }
    }
    mutex_exit(&raid_lock);
    if (!create)
        return NULL;
    if ((sc = raidcreate(unit)) == NULL)
        return NULL;
    mutex_enter(&raid_lock);
    LIST_INSERT_HEAD(&raids, sc, sc_link);
    mutex_exit(&raid_lock);
    return sc;
}

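/* Unlink a softc from the global list and release it. */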
static void
raidput(struct raid_softc *sc) {
    mutex_enter(&raid_lock);
    LIST_REMOVE(sc, sc_link);
    mutex_exit(&raid_lock);
    raiddestroy(sc);
}

void
raidattach(int num)
{

    /*
     * Device attachment and associated initialization now occurs
     * as part of the module initialization.
     */
}

int
rf_autoconfig(device_t self)
{
    RF_AutoConfig_t *ac_list;
    RF_ConfigSet_t *config_sets;

    if (!raidautoconfig || raidautoconfigdone == true)
        return (0);

    /* XXX This code can only be run once. */
    raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
    /*
     * 0. find the boot device if needed first so we can use it later
     * this needs to be done before we autoconfigure any raid sets,
     * because if we use wedges we are not going to be able to open
     * the boot device later
     */
    if (booted_device == NULL)
        cpu_bootconf();
#endif
    /* 1. locate all RAID components on the system */
    aprint_debug("Searching for RAID components...\n");
    ac_list = rf_find_raid_components();

    /* 2. Sort them into their respective sets. */
    config_sets = rf_create_auto_sets(ac_list);

    /*
     * 3. Evaluate each set and configure the valid ones.
     * This gets done in rf_buildroothack().
     */
    rf_buildroothack(config_sets);

    return 1;
}

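/*
 * Return non-zero if the given RAID set contains the boot device,
 * mapping wedge components (dkN) back to their parent disk before
 * comparing names.
 */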
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
    const char *bootname = device_xname(bdv);
    size_t len = strlen(bootname);

    for (int col = 0; col < r->numCol; col++) {
        const char *devname = r->Disks[col].devname;
        devname += sizeof("/dev/") - 1;
        if (strncmp(devname, "dk", 2) == 0) {
            const char *parent =
                dkwedge_get_parent_name(r->Disks[col].dev);
            if (parent != NULL)
                devname = parent;
        }
        if (strncmp(devname, bootname, len) == 0) {
            struct raid_softc *sc = r->softc;
            aprint_debug("raid%d includes boot device %s\n",
                sc->sc_unit, devname);
            return 1;
        }
    }
    return 0;
}

void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
    RF_ConfigSet_t *cset;
    RF_ConfigSet_t *next_cset;
    int num_root;
    struct raid_softc *sc, *rsc;
    struct dk_softc *dksc;

    sc = rsc = NULL;
    num_root = 0;
    cset = config_sets;
    while (cset != NULL) {
        next_cset = cset->next;
        if (rf_have_enough_components(cset) &&
            cset->ac->clabel->autoconfigure == 1) {
            sc = rf_auto_config_set(cset);
            if (sc != NULL) {
                aprint_debug("raid%d: configured ok\n",
                    sc->sc_unit);
                if (cset->rootable) {
                    rsc = sc;
                    num_root++;
                }
            } else {
                /* The autoconfig didn't work :( */
                aprint_debug("Autoconfig failed\n");
                rf_release_all_vps(cset);
            }
        } else {
            /* we're not autoconfiguring this set...
               release the associated resources */
            rf_release_all_vps(cset);
        }
        /* cleanup */
        rf_cleanup_config_set(cset);
        cset = next_cset;
    }
    dksc = &rsc->sc_dksc;

    /* if the user has specified what the root device should be
       then we don't touch booted_device or boothowto... */

    if (rootspec != NULL)
        return;

    /* we found something bootable... */

    /*
     * XXX: The following code assumes that the root raid
     * is the first ('a') partition.  This is about the best
     * we can do with a BSD disklabel, but we might be able
     * to do better with a GPT label, by setting a specified
     * attribute to indicate the root partition.  We can then
     * stash the partition number in the r->root_partition
     * high bits (the bottom 2 bits are already used).  For
     * now we just set booted_partition to 0 when we override
     * root.
     */
    if (num_root == 1) {
        device_t candidate_root;
        if (dksc->sc_dkdev.dk_nwedges != 0) {
            char cname[sizeof(cset->ac->devname)];
            /* XXX: assume 'a' */
            snprintf(cname, sizeof(cname), "%s%c",
                device_xname(dksc->sc_dev), 'a');
            candidate_root = dkwedge_find_by_wname(cname);
        } else
            candidate_root = dksc->sc_dev;
        if (booted_device == NULL ||
            rsc->sc_r.root_partition == 1 ||
            rf_containsboot(&rsc->sc_r, booted_device)) {
            booted_device = candidate_root;
            booted_partition = 0;	/* XXX assume 'a' */
        }
    } else if (num_root > 1) {

        /*
         * Maybe the MD code can help. If it cannot, then
         * setroot() will discover that we have no
         * booted_device and will ask the user if nothing was
         * hardwired in the kernel config file
         */
        if (booted_device == NULL)
            return;

        num_root = 0;
        mutex_enter(&raid_lock);
        LIST_FOREACH(sc, &raids, sc_link) {
            RF_Raid_t *r = &sc->sc_r;
            if (r->valid == 0)
                continue;

            if (r->root_partition == 0)
                continue;

            if (rf_containsboot(r, booted_device)) {
                num_root++;
                rsc = sc;
                dksc = &rsc->sc_dksc;
            }
        }
        mutex_exit(&raid_lock);

        if (num_root == 1) {
            booted_device = dksc->sc_dev;
            booted_partition = 0;	/* XXX assume 'a' */
        } else {
            /* we can't guess.. require the user to answer... */
            boothowto |= RB_ASKNAME;
        }
    }
}

static int
raidsize(dev_t dev)
{
    struct raid_softc *rs;
    struct dk_softc *dksc;
    unsigned int unit;

    unit = raidunit(dev);
    if ((rs = raidget(unit, false)) == NULL)
        return -1;
    dksc = &rs->sc_dksc;

    if ((rs->sc_flags & RAIDF_INITED) == 0)
        return -1;

    return dk_size(dksc, dev);
}

static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
    unsigned int unit;
    struct raid_softc *rs;
    struct dk_softc *dksc;

    unit = raidunit(dev);
    if ((rs = raidget(unit, false)) == NULL)
        return ENXIO;
    dksc = &rs->sc_dksc;

    if ((rs->sc_flags & RAIDF_INITED) == 0)
        return ENODEV;

    /*
       Note that blkno is relative to this particular partition.
       By adding RF_PROTECTED_SECTORS, we get a value that
       is relative to the partition used for the underlying component.
    */
    blkno += RF_PROTECTED_SECTORS;

    return dk_dump(dksc, dev, blkno, va, size);
}

static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
    struct raid_softc *rs = raidsoftc(dev);
    const struct bdevsw *bdev;
    RF_Raid_t *raidPtr;
    int c, sparecol, j, scol, dumpto;
    int error = 0;

    raidPtr = &rs->sc_r;

    /* we only support dumping to RAID 1 sets */
    if (raidPtr->Layout.numDataCol != 1 ||
        raidPtr->Layout.numParityCol != 1)
        return EINVAL;

    if ((error = raidlock(rs)) != 0)
        return error;

    /* figure out what device is alive.. */

    /*
       Look for a component to dump to.  The preference for the
       component to dump to is as follows:
       1) the master
       2) a used_spare of the master
       3) the slave
       4) a used_spare of the slave
    */

    dumpto = -1;
    for (c = 0; c < raidPtr->numCol; c++) {
        if (raidPtr->Disks[c].status == rf_ds_optimal) {
            /* this might be the one */
            dumpto = c;
            break;
        }
    }

    /*
       At this point we have possibly selected a live master or a
       live slave.  We now check to see if there is a spared
       master (or a spared slave), if we didn't find a live master
       or a live slave.
    */

    for (c = 0; c < raidPtr->numSpare; c++) {
        sparecol = raidPtr->numCol + c;
        if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
            /* How about this one? */
            scol = -1;
            for (j = 0; j < raidPtr->numCol; j++) {
                if (raidPtr->Disks[j].spareCol == sparecol) {
                    scol = j;
                    break;
                }
            }
            if (scol == 0) {
                /*
                   We must have found a spared master!
                   We'll take that over anything else
                   found so far.  (We couldn't have
                   found a real master before, since
                   this is a used spare, and it's
                   saying that it's replacing the
                   master.)  On reboot (with
                   autoconfiguration turned on)
                   sparecol will become the 1st
                   component (component0) of this set.
                */
                dumpto = sparecol;
                break;
            } else if (scol != -1) {
                /*
                   Must be a spared slave.  We'll dump
                   to that if we haven't found anything
                   else so far.
                */
                if (dumpto == -1)
                    dumpto = sparecol;
            }
        }
    }

    if (dumpto == -1) {
        /* we couldn't find any live components to dump to!?!?
         */
        error = EINVAL;
        goto out;
    }

    bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

    error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
        blkno, va, nblk * raidPtr->bytesPerSector);

out:
    raidunlock(rs);

    return error;
}

/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
    int unit = raidunit(dev);
    struct raid_softc *rs;
    struct dk_softc *dksc;
    int error = 0;
    int part, pmask;

    if ((rs = raidget(unit, true)) == NULL)
        return ENXIO;
    if ((error = raidlock(rs)) != 0)
        return (error);

    if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
        error = EBUSY;
        goto bad;
    }

    dksc = &rs->sc_dksc;

    part = DISKPART(dev);
    pmask = (1 << part);

    if (!DK_BUSY(dksc, pmask) &&
        ((rs->sc_flags & RAIDF_INITED) != 0)) {
        /* First one... mark things as dirty... Note that we *MUST*
           have done a configure before this.  I DO NOT WANT TO BE
           SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
           THAT THEY BELONG TOGETHER!!!!! */
        /* XXX should check to see if we're only open for reading
           here... If so, we needn't do this, but then need some
           other way of keeping track of what's happened.. */

        rf_markalldirty(&rs->sc_r);
    }

    if ((rs->sc_flags & RAIDF_INITED) != 0)
        error = dk_open(dksc, dev, flags, fmt, l);

bad:
    raidunlock(rs);

    return (error);


}

static int
raid_lastclose(device_t self)
{
    struct raid_softc *rs = raidsoftc(self);

    /* Last one... device is not unconfigured yet.
       Device shutdown has taken care of setting the
       clean bits if RAIDF_INITED is not set
       mark things as clean... */

    rf_update_component_labels(&rs->sc_r,
        RF_FINAL_COMPONENT_UPDATE);

    /* pass to unlocked code */
    if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
        rs->sc_flags |= RAIDF_DETACH;

    return 0;
}

/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
    int unit = raidunit(dev);
    struct raid_softc *rs;
    struct dk_softc *dksc;
    cfdata_t cf;
    int error = 0, do_detach = 0, do_put = 0;

    if ((rs = raidget(unit, false)) == NULL)
        return ENXIO;
    dksc = &rs->sc_dksc;

    if ((error = raidlock(rs)) != 0)
        return (error);

    if ((rs->sc_flags & RAIDF_INITED) != 0) {
        error = dk_close(dksc, dev, flags, fmt, l);
        if ((rs->sc_flags & RAIDF_DETACH) != 0)
            do_detach = 1;
    } else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
        do_put = 1;

    raidunlock(rs);

    if (do_detach) {
        /* free the pseudo device attach bits */
        cf = device_cfdata(dksc->sc_dev);
        error = config_detach(dksc->sc_dev, 0);
        if (error == 0)
            free(cf, M_RAIDFRAME);
    } else if (do_put) {
        raidput(rs);
    }

    return (error);

}

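/*
 * Poke the RAIDframe engine: signal iodone_cv so the engine thread
 * re-examines its queues.
 */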
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
    rf_lock_mutex2(raidPtr->iodone_lock);
    rf_signal_cond2(raidPtr->iodone_cv);
    rf_unlock_mutex2(raidPtr->iodone_lock);
}

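/*
 * Block device strategy entry point: hand the buffer to the dk(9)
 * queue and wake the engine so the I/O gets issued at the next
 * convenient time.
 */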
static void
raidstrategy(struct buf *bp)
{
    unsigned int unit;
    struct raid_softc *rs;
    struct dk_softc *dksc;
    RF_Raid_t *raidPtr;

    unit = raidunit(bp->b_dev);
    if ((rs = raidget(unit, false)) == NULL) {
        bp->b_error = ENXIO;
        goto fail;
    }
    if ((rs->sc_flags & RAIDF_INITED) == 0) {
        bp->b_error = ENXIO;
        goto fail;
    }
    dksc = &rs->sc_dksc;
    raidPtr = &rs->sc_r;

    /* Queue IO only */
    if (dk_strategy_defer(dksc, bp))
        goto done;

    /* schedule the IO to happen at the next convenient time */
    raid_wakeup(raidPtr);

done:
    return;

fail:
    bp->b_resid = bp->b_bcount;
    biodone(bp);
}

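/* dk(9) start callback: issue one queued buffer on a valid RAID set. */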
static int
raid_diskstart(device_t dev, struct buf *bp)
{
    struct raid_softc *rs = raidsoftc(dev);
    RF_Raid_t *raidPtr;

    raidPtr = &rs->sc_r;
    if (!raidPtr->valid) {
        db1_printf(("raid is not valid..\n"));
        return ENODEV;
    }

    /* XXX */
    bp->b_resid = 0;

    return raiddoaccess(raidPtr, bp);
}

void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
    struct raid_softc *rs;
    struct dk_softc *dksc;

    rs = raidPtr->softc;
    dksc = &rs->sc_dksc;

    dk_done(dksc, bp);

    rf_lock_mutex2(raidPtr->mutex);
    raidPtr->openings++;
    rf_unlock_mutex2(raidPtr->mutex);

    /* schedule more IO */
    raid_wakeup(raidPtr);
}

/* ARGSUSED */
static int
raidread(dev_t dev, struct uio *uio, int flags)
{
    int unit = raidunit(dev);
    struct raid_softc *rs;

    if ((rs = raidget(unit, false)) == NULL)
        return ENXIO;

    if ((rs->sc_flags & RAIDF_INITED) == 0)
        return (ENXIO);

    return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));

}

/* ARGSUSED */
static int
raidwrite(dev_t dev, struct uio *uio, int flags)
{
    int unit = raidunit(dev);
    struct raid_softc *rs;

    if ((rs = raidget(unit, false)) == NULL)
        return ENXIO;

    if ((rs->sc_flags & RAIDF_INITED) == 0)
        return (ENXIO);

    return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));

}

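/*
 * Tear down an initialized unit: refuse while the device is busy or a
 * reconstruction, parity rewrite or copyback is in progress; otherwise
 * shut down RAIDframe and detach the disk.  Called with the unit locked.
 */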
static int
raid_detach_unlocked(struct raid_softc *rs)
{
    struct dk_softc *dksc = &rs->sc_dksc;
    RF_Raid_t *raidPtr;
    int error;

    raidPtr = &rs->sc_r;

    if (DK_BUSY(dksc, 0) ||
        raidPtr->recon_in_progress != 0 ||
        raidPtr->parity_rewrite_in_progress != 0 ||
        raidPtr->copyback_in_progress != 0)
        return EBUSY;

    if ((rs->sc_flags & RAIDF_INITED) == 0)
        return 0;

    rs->sc_flags &= ~RAIDF_SHUTDOWN;

    if ((error = rf_Shutdown(raidPtr)) != 0)
        return error;

    rs->sc_flags &= ~RAIDF_INITED;

    /* Kill off any queued buffers */
    dk_drain(dksc);
    bufq_free(dksc->sc_bufq);

    /* Detach the disk. */
    dkwedge_delall(&dksc->sc_dkdev);
    disk_detach(&dksc->sc_dkdev);
    disk_destroy(&dksc->sc_dkdev);
    dk_detach(dksc);

    return 0;
}

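/*
 * Handle the RAIDframe-specific ioctls; anything not recognized here
 * is passed through to dk_ioctl() for the generic disk(9) handling.
 */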
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
    int unit = raidunit(dev);
    int error = 0;
    int part, pmask;
    struct raid_softc *rs;
    struct dk_softc *dksc;
    RF_Config_t *k_cfg, *u_cfg;
    RF_Raid_t *raidPtr;
    RF_RaidDisk_t *diskPtr;
    RF_AccTotals_t *totals;
    RF_DeviceConfig_t *d_cfg, **ucfgp;
    u_char *specific_buf;
    int retcode = 0;
    int column;
    /* int raidid; */
    struct rf_recon_req *rrcopy, *rr;
    RF_ComponentLabel_t *clabel;
    RF_ComponentLabel_t *ci_label;
    RF_ComponentLabel_t **clabel_ptr;
    RF_SingleComponent_t *sparePtr, *componentPtr;
    RF_SingleComponent_t component;
    RF_ProgressInfo_t progressInfo, **progressInfoPtr;
    int i, j, d;

    if ((rs = raidget(unit, false)) == NULL)
        return ENXIO;
    dksc = &rs->sc_dksc;
    raidPtr = &rs->sc_r;

    db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
        (int) DISKPART(dev), (int) unit, cmd));

    /* Must be initialized for these... */
    switch (cmd) {
    case RAIDFRAME_REWRITEPARITY:
    case RAIDFRAME_GET_INFO:
    case RAIDFRAME_RESET_ACCTOTALS:
    case RAIDFRAME_GET_ACCTOTALS:
    case RAIDFRAME_KEEP_ACCTOTALS:
    case RAIDFRAME_GET_SIZE:
    case RAIDFRAME_FAIL_DISK:
    case RAIDFRAME_COPYBACK:
    case RAIDFRAME_CHECK_RECON_STATUS:
    case RAIDFRAME_CHECK_RECON_STATUS_EXT:
    case RAIDFRAME_GET_COMPONENT_LABEL:
    case RAIDFRAME_SET_COMPONENT_LABEL:
    case RAIDFRAME_ADD_HOT_SPARE:
    case RAIDFRAME_REMOVE_HOT_SPARE:
    case RAIDFRAME_INIT_LABELS:
    case RAIDFRAME_REBUILD_IN_PLACE:
    case RAIDFRAME_CHECK_PARITY:
    case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
    case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
    case RAIDFRAME_CHECK_COPYBACK_STATUS:
    case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
    case RAIDFRAME_SET_AUTOCONFIG:
    case RAIDFRAME_SET_ROOT:
    case RAIDFRAME_DELETE_COMPONENT:
    case RAIDFRAME_INCORPORATE_HOT_SPARE:
    case RAIDFRAME_PARITYMAP_STATUS:
    case RAIDFRAME_PARITYMAP_GET_DISABLE:
    case RAIDFRAME_PARITYMAP_SET_DISABLE:
    case RAIDFRAME_PARITYMAP_SET_PARAMS:
        if ((rs->sc_flags & RAIDF_INITED) == 0)
            return (ENXIO);
    }

    switch (cmd) {
#ifdef COMPAT_50
    case RAIDFRAME_GET_INFO50:
        return rf_get_info50(raidPtr, data);

    case RAIDFRAME_CONFIGURE50:
        if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
            return retcode;
        goto config;
#endif
        /* configure the system */
    case RAIDFRAME_CONFIGURE:

        if (raidPtr->valid) {
            /* There is a valid RAID set running on this unit! */
            printf("raid%d: Device already configured!\n", unit);
            return (EINVAL);
        }

        /* copy-in the configuration information */
        /* data points to a pointer to the configuration structure */

        u_cfg = *((RF_Config_t **) data);
        RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
        if (k_cfg == NULL) {
            return (ENOMEM);
        }
        retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
        if (retcode) {
            RF_Free(k_cfg, sizeof(RF_Config_t));
            db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
                retcode));
            goto no_config;
        }
        goto config;
    config:
        rs->sc_flags &= ~RAIDF_SHUTDOWN;

        /* allocate a buffer for the layout-specific data, and copy it
         * in */
        if (k_cfg->layoutSpecificSize) {
            if (k_cfg->layoutSpecificSize > 10000) {
                /* sanity check */
                RF_Free(k_cfg, sizeof(RF_Config_t));
                retcode = EINVAL;
                goto no_config;
            }
            RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
                (u_char *));
            if (specific_buf == NULL) {
                RF_Free(k_cfg, sizeof(RF_Config_t));
                retcode = ENOMEM;
                goto no_config;
            }
            retcode = copyin(k_cfg->layoutSpecific, specific_buf,
                k_cfg->layoutSpecificSize);
            if (retcode) {
                RF_Free(k_cfg, sizeof(RF_Config_t));
                RF_Free(specific_buf,
                    k_cfg->layoutSpecificSize);
                db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
                    retcode));
                goto no_config;
            }
        } else
            specific_buf = NULL;
        k_cfg->layoutSpecific = specific_buf;

        /* should do some kind of sanity check on the configuration.
         * Store the sum of all the bytes in the last byte? */

        /* configure the system */

        /*
         * Clear the entire RAID descriptor, just to make sure
         * there is no stale data left in the case of a
         * reconfiguration
         */
        memset(raidPtr, 0, sizeof(*raidPtr));
        raidPtr->softc = rs;
        raidPtr->raidid = unit;

        retcode = rf_Configure(raidPtr, k_cfg, NULL);

        if (retcode == 0) {

            /* allow this many simultaneous IO's to
               this RAID device */
            raidPtr->openings = RAIDOUTSTANDING;

            raidinit(rs);
            raid_wakeup(raidPtr);
            rf_markalldirty(raidPtr);
        }
        /* free the buffers.  No return code here. */
        if (k_cfg->layoutSpecificSize) {
            RF_Free(specific_buf, k_cfg->layoutSpecificSize);
        }
        RF_Free(k_cfg, sizeof(RF_Config_t));

    no_config:
        /*
         * If configuration failed, set sc_flags so that we
         * will detach the device when we close it.
         */
        if (retcode != 0)
            rs->sc_flags |= RAIDF_SHUTDOWN;
        return (retcode);

        /* shutdown the system */
    case RAIDFRAME_SHUTDOWN:

        part = DISKPART(dev);
        pmask = (1 << part);

        if ((error = raidlock(rs)) != 0)
            return (error);

        if (DK_BUSY(dksc, pmask) ||
            raidPtr->recon_in_progress != 0 ||
            raidPtr->parity_rewrite_in_progress != 0 ||
            raidPtr->copyback_in_progress != 0)
            retcode = EBUSY;
        else {
            /* detach and free on close */
            rs->sc_flags |= RAIDF_SHUTDOWN;
            retcode = 0;
        }

        raidunlock(rs);

        return (retcode);
    case RAIDFRAME_GET_COMPONENT_LABEL:
        clabel_ptr = (RF_ComponentLabel_t **) data;
        /* need to read the component label for the disk indicated
           by row,column in clabel */

        /*
         * Perhaps there should be an option to skip the in-core
         * copy and hit the disk, as with disklabel(8).
         */
        RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));

        retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));

        if (retcode) {
            RF_Free(clabel, sizeof(*clabel));
            return retcode;
        }

        clabel->row = 0; /* Don't allow looking at anything else. */

        column = clabel->column;

        if ((column < 0) || (column >= raidPtr->numCol +
            raidPtr->numSpare)) {
            RF_Free(clabel, sizeof(*clabel));
            return EINVAL;
        }

        RF_Free(clabel, sizeof(*clabel));

        clabel = raidget_component_label(raidPtr, column);

        return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));

#if 0
    case RAIDFRAME_SET_COMPONENT_LABEL:
        clabel = (RF_ComponentLabel_t *) data;

        /* XXX check the label for valid stuff... */
        /* Note that some things *should not* get modified --
           the user should be re-initing the labels instead of
           trying to patch things.
        */

        raidid = raidPtr->raidid;
#ifdef DEBUG
        printf("raid%d: Got component label:\n", raidid);
        printf("raid%d: Version: %d\n", raidid, clabel->version);
        printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
        printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
        printf("raid%d: Column: %d\n", raidid, clabel->column);
        printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
        printf("raid%d: Clean: %d\n", raidid, clabel->clean);
        printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
        clabel->row = 0;
        column = clabel->column;

        if ((column < 0) || (column >= raidPtr->numCol)) {
            return (EINVAL);
        }

        /* XXX this isn't allowed to do anything for now :-) */

        /* XXX and before it is, we need to fill in the rest
           of the fields!?!?!?! */
        memcpy(raidget_component_label(raidPtr, column),
            clabel, sizeof(*clabel));
        raidflush_component_label(raidPtr, column);
        return (0);
#endif

    case RAIDFRAME_INIT_LABELS:
        clabel = (RF_ComponentLabel_t *) data;
        /*
           we only want the serial number from
           the above.  We get all the rest of the information
           from the config that was used to create this RAID
           set.
        */

        raidPtr->serial_number = clabel->serial_number;

        for (column = 0; column < raidPtr->numCol; column++) {
            diskPtr = &raidPtr->Disks[column];
            if (!RF_DEAD_DISK(diskPtr->status)) {
                ci_label = raidget_component_label(raidPtr,
                    column);
                /* Zeroing this is important. */
                memset(ci_label, 0, sizeof(*ci_label));
                raid_init_component_label(raidPtr, ci_label);
                ci_label->serial_number =
                    raidPtr->serial_number;
                ci_label->row = 0; /* we don't pretend to support more */
                rf_component_label_set_partitionsize(ci_label,
                    diskPtr->partitionSize);
                ci_label->column = column;
                raidflush_component_label(raidPtr, column);
            }
            /* XXXjld what about the spares? */
        }

        return (retcode);
    case RAIDFRAME_SET_AUTOCONFIG:
        d = rf_set_autoconfig(raidPtr, *(int *) data);
        printf("raid%d: New autoconfig value is: %d\n",
            raidPtr->raidid, d);
        *(int *) data = d;
        return (retcode);

    case RAIDFRAME_SET_ROOT:
        d = rf_set_rootpartition(raidPtr, *(int *) data);
        printf("raid%d: New rootpartition value is: %d\n",
            raidPtr->raidid, d);
        *(int *) data = d;
        return (retcode);

        /* initialize all parity */
    case RAIDFRAME_REWRITEPARITY:

        if (raidPtr->Layout.map->faultsTolerated == 0) {
            /* Parity for RAID 0 is trivially correct */
            raidPtr->parity_good = RF_RAID_CLEAN;
            return (0);
        }

        if (raidPtr->parity_rewrite_in_progress == 1) {
            /* Re-write is already in progress! */
            return (EINVAL);
        }

        retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
            rf_RewriteParityThread,
            raidPtr, "raid_parity");
        return (retcode);


    case RAIDFRAME_ADD_HOT_SPARE:
        sparePtr = (RF_SingleComponent_t *) data;
        memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
        retcode = rf_add_hot_spare(raidPtr, &component);
        return (retcode);

    case RAIDFRAME_REMOVE_HOT_SPARE:
        return (retcode);

    case RAIDFRAME_DELETE_COMPONENT:
        componentPtr = (RF_SingleComponent_t *)data;
        memcpy(&component, componentPtr,
            sizeof(RF_SingleComponent_t));
        retcode = rf_delete_component(raidPtr, &component);
        return (retcode);

    case RAIDFRAME_INCORPORATE_HOT_SPARE:
        componentPtr = (RF_SingleComponent_t *)data;
        memcpy(&component, componentPtr,
            sizeof(RF_SingleComponent_t));
        retcode = rf_incorporate_hot_spare(raidPtr, &component);
        return (retcode);

    case RAIDFRAME_REBUILD_IN_PLACE:

        if (raidPtr->Layout.map->faultsTolerated == 0) {
            /* Can't do this on a RAID 0!! */
            return (EINVAL);
        }

        if (raidPtr->recon_in_progress == 1) {
            /* a reconstruct is already in progress! */
            return (EINVAL);
        }

        componentPtr = (RF_SingleComponent_t *) data;
        memcpy(&component, componentPtr,
            sizeof(RF_SingleComponent_t));
        component.row = 0; /* we don't support any more */
        column = component.column;

        if ((column < 0) || (column >= raidPtr->numCol)) {
            return (EINVAL);
        }

        rf_lock_mutex2(raidPtr->mutex);
        if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
            (raidPtr->numFailures > 0)) {
            /* XXX 0 above shouldn't be constant!!! */
            /* some component other than this has failed.
               Let's not make things worse than they already
               are... */
            printf("raid%d: Unable to reconstruct to disk at:\n",
                raidPtr->raidid);
            printf("raid%d:     Col: %d   Too many failures.\n",
                raidPtr->raidid, column);
            rf_unlock_mutex2(raidPtr->mutex);
            return (EINVAL);
        }
        if (raidPtr->Disks[column].status ==
            rf_ds_reconstructing) {
            printf("raid%d: Unable to reconstruct to disk at:\n",
                raidPtr->raidid);
            printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

            rf_unlock_mutex2(raidPtr->mutex);
            return (EINVAL);
        }
        if (raidPtr->Disks[column].status == rf_ds_spared) {
            rf_unlock_mutex2(raidPtr->mutex);
            return (EINVAL);
        }
        rf_unlock_mutex2(raidPtr->mutex);

        RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
        if (rrcopy == NULL)
            return (ENOMEM);

        rrcopy->raidPtr = (void *) raidPtr;
        rrcopy->col = column;

        retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
            rf_ReconstructInPlaceThread,
            rrcopy, "raid_reconip");
        return (retcode);

    case RAIDFRAME_GET_INFO:
        if (!raidPtr->valid)
            return (ENODEV);
        ucfgp = (RF_DeviceConfig_t **) data;
        RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
            (RF_DeviceConfig_t *));
        if (d_cfg == NULL)
            return (ENOMEM);
        d_cfg->rows = 1; /* there is only 1 row now */
        d_cfg->cols = raidPtr->numCol;
        d_cfg->ndevs = raidPtr->numCol;
        if (d_cfg->ndevs >= RF_MAX_DISKS) {
            RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
            return (ENOMEM);
        }
        d_cfg->nspares = raidPtr->numSpare;
        if (d_cfg->nspares >= RF_MAX_DISKS) {
            RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
            return (ENOMEM);
        }
        d_cfg->maxqdepth = raidPtr->maxQueueDepth;
        d = 0;
        for (j = 0; j < d_cfg->cols; j++) {
            d_cfg->devs[d] = raidPtr->Disks[j];
            d++;
        }
        for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
            d_cfg->spares[i] = raidPtr->Disks[j];
            if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
                /* XXX: raidctl(8) expects to see this as a used spare */
                d_cfg->spares[i].status = rf_ds_used_spare;
            }
        }
        retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
        RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

        return (retcode);

    case RAIDFRAME_CHECK_PARITY:
        *(int *) data = raidPtr->parity_good;
        return (0);

    case RAIDFRAME_PARITYMAP_STATUS:
        if (rf_paritymap_ineligible(raidPtr))
            return EINVAL;
        rf_paritymap_status(raidPtr->parity_map,
            (struct rf_pmstat *)data);
        return 0;

    case RAIDFRAME_PARITYMAP_SET_PARAMS:
        if (rf_paritymap_ineligible(raidPtr))
            return EINVAL;
        if (raidPtr->parity_map == NULL)
            return ENOENT; /* ??? */
        if (0 != rf_paritymap_set_params(raidPtr->parity_map,
            (struct rf_pmparams *)data, 1))
            return EINVAL;
        return 0;

    case RAIDFRAME_PARITYMAP_GET_DISABLE:
        if (rf_paritymap_ineligible(raidPtr))
            return EINVAL;
        *(int *) data = rf_paritymap_get_disable(raidPtr);
        return 0;

    case RAIDFRAME_PARITYMAP_SET_DISABLE:
        if (rf_paritymap_ineligible(raidPtr))
            return EINVAL;
        rf_paritymap_set_disable(raidPtr, *(int *)data);
        /* XXX should errors be passed up? */
        return 0;

    case RAIDFRAME_RESET_ACCTOTALS:
        memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
        return (0);

    case RAIDFRAME_GET_ACCTOTALS:
        totals = (RF_AccTotals_t *) data;
        *totals = raidPtr->acc_totals;
        return (0);

    case RAIDFRAME_KEEP_ACCTOTALS:
        raidPtr->keep_acc_totals = *(int *)data;
        return (0);

    case RAIDFRAME_GET_SIZE:
        *(int *) data = raidPtr->totalSectors;
        return (0);

        /* fail a disk & optionally start reconstruction */
    case RAIDFRAME_FAIL_DISK:

        if (raidPtr->Layout.map->faultsTolerated == 0) {
            /* Can't do this on a RAID 0!! */
            return (EINVAL);
        }

        rr = (struct rf_recon_req *) data;
        rr->row = 0;
        if (rr->col < 0 || rr->col >= raidPtr->numCol)
            return (EINVAL);


        rf_lock_mutex2(raidPtr->mutex);
        if (raidPtr->status == rf_rs_reconstructing) {
            /* you can't fail a disk while we're reconstructing! */
            /* XXX wrong for RAID6 */
            rf_unlock_mutex2(raidPtr->mutex);
            return (EINVAL);
        }
        if ((raidPtr->Disks[rr->col].status ==
            rf_ds_optimal) && (raidPtr->numFailures > 0)) {
            /* some other component has failed.  Let's not make
               things worse. XXX wrong for RAID6 */
            rf_unlock_mutex2(raidPtr->mutex);
            return (EINVAL);
        }
        if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
            /* Can't fail a spared disk! */
            rf_unlock_mutex2(raidPtr->mutex);
            return (EINVAL);
        }
        rf_unlock_mutex2(raidPtr->mutex);

        /* make a copy of the recon request so that we don't rely on
         * the user's buffer */
        RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
        if (rrcopy == NULL)
            return (ENOMEM);
        memcpy(rrcopy, rr, sizeof(*rr));
        rrcopy->raidPtr = (void *) raidPtr;

        retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
            rf_ReconThread,
            rrcopy, "raid_recon");
        return (0);

        /* invoke a copyback operation after recon on whatever disk
         * needs it, if any */
    case RAIDFRAME_COPYBACK:

        if (raidPtr->Layout.map->faultsTolerated == 0) {
            /* This makes no sense on a RAID 0!! */
            return (EINVAL);
        }

        if (raidPtr->copyback_in_progress == 1) {
            /* Copyback is already in progress! */
            return (EINVAL);
        }

        retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
            rf_CopybackThread,
            raidPtr, "raid_copyback");
        return (retcode);

        /* return the percentage completion of reconstruction */
    case RAIDFRAME_CHECK_RECON_STATUS:
        if (raidPtr->Layout.map->faultsTolerated == 0) {
            /* This makes no sense on a RAID 0, so tell the
               user it's done. */
            *(int *) data = 100;
            return (0);
        }
        if (raidPtr->status != rf_rs_reconstructing)
            *(int *) data = 100;
        else {
            if (raidPtr->reconControl->numRUsTotal > 0) {
                *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
            } else {
                *(int *) data = 0;
            }
        }
        return (0);
    case RAIDFRAME_CHECK_RECON_STATUS_EXT:
        progressInfoPtr = (RF_ProgressInfo_t **) data;
        if (raidPtr->status != rf_rs_reconstructing) {
            progressInfo.remaining = 0;
            progressInfo.completed = 100;
            progressInfo.total = 100;
        } else {
            progressInfo.total =
                raidPtr->reconControl->numRUsTotal;
            progressInfo.completed =
                raidPtr->reconControl->numRUsComplete;
            progressInfo.remaining = progressInfo.total -
                progressInfo.completed;
        }
        retcode = copyout(&progressInfo, *progressInfoPtr,
            sizeof(RF_ProgressInfo_t));
        return (retcode);

    case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
        if (raidPtr->Layout.map->faultsTolerated == 0) {
            /* This makes no sense on a RAID 0, so tell the
               user it's done. */
            *(int *) data = 100;
            return (0);
        }
        if (raidPtr->parity_rewrite_in_progress == 1) {
            *(int *) data = 100 *
                raidPtr->parity_rewrite_stripes_done /
                raidPtr->Layout.numStripe;
        } else {
            *(int *) data = 100;
        }
        return (0);

    case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
        progressInfoPtr = (RF_ProgressInfo_t **) data;
        if (raidPtr->parity_rewrite_in_progress == 1) {
            progressInfo.total = raidPtr->Layout.numStripe;
            progressInfo.completed =
                raidPtr->parity_rewrite_stripes_done;
            progressInfo.remaining = progressInfo.total -
                progressInfo.completed;
        } else {
            progressInfo.remaining = 0;
            progressInfo.completed = 100;
            progressInfo.total = 100;
        }
        retcode = copyout(&progressInfo, *progressInfoPtr,
            sizeof(RF_ProgressInfo_t));
        return (retcode);

    case RAIDFRAME_CHECK_COPYBACK_STATUS:
        if (raidPtr->Layout.map->faultsTolerated == 0) {
            /* This makes no sense on a RAID 0 */
            *(int *) data = 100;
            return (0);
        }
        if (raidPtr->copyback_in_progress == 1) {
            *(int *) data = 100 * raidPtr->copyback_stripes_done /
                raidPtr->Layout.numStripe;
        } else {
            *(int *) data = 100;
        }
        return (0);

    case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
        progressInfoPtr = (RF_ProgressInfo_t **) data;
        if (raidPtr->copyback_in_progress == 1) {
            progressInfo.total = raidPtr->Layout.numStripe;
            progressInfo.completed =
                raidPtr->copyback_stripes_done;
            progressInfo.remaining = progressInfo.total -
                progressInfo.completed;
        } else {
            progressInfo.remaining = 0;
            progressInfo.completed = 100;
            progressInfo.total = 100;
        }
        retcode = copyout(&progressInfo, *progressInfoPtr,
            sizeof(RF_ProgressInfo_t));
        return (retcode);

        /* the sparetable daemon calls this to wait for the kernel to
         * need a spare table. this ioctl does not return until a
         * spare table is needed. XXX -- calling mpsleep here in the
         * ioctl code is almost certainly wrong and evil. -- XXX XXX
         * -- I should either compute the spare table in the kernel,
         * or have a different -- XXX XXX -- interface (a different
         * character device) for delivering the table -- XXX */
#if 0
    case RAIDFRAME_SPARET_WAIT:
        rf_lock_mutex2(rf_sparet_wait_mutex);
        while (!rf_sparet_wait_queue)
            rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
        waitreq = rf_sparet_wait_queue;
        rf_sparet_wait_queue = rf_sparet_wait_queue->next;
        rf_unlock_mutex2(rf_sparet_wait_mutex);

        /* structure assignment */
        *((RF_SparetWait_t *) data) = *waitreq;

        RF_Free(waitreq, sizeof(*waitreq));
        return (0);

        /* wakes up a process waiting on SPARET_WAIT and puts an error
         * code in it that will cause the daemon to exit */
    case RAIDFRAME_ABORT_SPARET_WAIT:
        RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
        waitreq->fcol = -1;
        rf_lock_mutex2(rf_sparet_wait_mutex);
        waitreq->next = rf_sparet_wait_queue;
        rf_sparet_wait_queue = waitreq;
        rf_broadcast_cond2(rf_sparet_wait_cv);
        rf_unlock_mutex2(rf_sparet_wait_mutex);
        return (0);

        /* used by the spare table daemon to deliver a spare table
         * into the kernel */
    case RAIDFRAME_SEND_SPARET:

        /* install the spare table */
        retcode = rf_SetSpareTable(raidPtr, *(void **) data);

        /* respond to the requestor.  the return status of the spare
         * table installation is passed in the "fcol" field */
        RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
        waitreq->fcol = retcode;
        rf_lock_mutex2(rf_sparet_wait_mutex);
        waitreq->next = rf_sparet_resp_queue;
        rf_sparet_resp_queue = waitreq;
        rf_broadcast_cond2(rf_sparet_resp_cv);
        rf_unlock_mutex2(rf_sparet_wait_mutex);

        return (retcode);
#endif

    default:
        break; /* fall through to the os-specific code below */

    }

    if (!raidPtr->valid)
        return (EINVAL);

    /*
     * Add support for "regular" device ioctls here.
     */

    error = dk_ioctl(dksc, dev, cmd, data, flag, l);
    if (error != EPASSTHROUGH)
        return (error);

    switch (cmd) {
    case DIOCCACHESYNC:
        return rf_sync_component_caches(raidPtr);

    default:
        retcode = ENOTTY;
    }
    return (retcode);

}


/* raidinit -- complete the rest of the initialization for the
   RAIDframe device. */


static void
raidinit(struct raid_softc *rs)
{
    cfdata_t cf;
    unsigned int unit;
    struct dk_softc *dksc = &rs->sc_dksc;
    RF_Raid_t *raidPtr = &rs->sc_r;
    device_t dev;

    unit = raidPtr->raidid;

    /* XXX doesn't check bounds. */
    snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

    /* attach the pseudo device */
    cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
    cf->cf_name = raid_cd.cd_name;
    cf->cf_atname = raid_cd.cd_name;
    cf->cf_unit = unit;
    cf->cf_fstate = FSTATE_STAR;

    dev = config_attach_pseudo(cf);
    if (dev == NULL) {
        printf("raid%d: config_attach_pseudo failed\n",
            raidPtr->raidid);
        free(cf, M_RAIDFRAME);
        return;
    }

    /* provide a backpointer to the real softc */
    raidsoftc(dev) = rs;

    /* disk_attach actually creates space for the CPU disklabel, among
     * other things, so it's critical to call this *BEFORE* we try putzing
     * with disklabels. */
    dk_init(dksc, dev, DKTYPE_RAID);
    disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

    /* XXX There may be a weird interaction here between this, and
     * protectedSectors, as used in RAIDframe.  */

    rs->sc_size = raidPtr->totalSectors;

    /* Attach dk and disk subsystems */
    dk_attach(dksc);
    disk_attach(&dksc->sc_dkdev);
    rf_set_geometry(rs, raidPtr);

    bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

    /* mark unit as usable */
    rs->sc_flags |= RAIDF_INITED;

    dkwedge_discover(&dksc->sc_dkdev);
}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
    int retcode;

    rf_lock_mutex2(rf_sparet_wait_mutex);
    req->next = rf_sparet_wait_queue;
    rf_sparet_wait_queue = req;
    rf_broadcast_cond2(rf_sparet_wait_cv);

    /* mpsleep unlocks the mutex */
    while (!rf_sparet_resp_queue) {
        rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
    }
    req = rf_sparet_resp_queue;
    rf_sparet_resp_queue = req->next;
    rf_unlock_mutex2(rf_sparet_wait_mutex);

    retcode = req->fcol;
    RF_Free(req, sizeof(*req));	/* this is not the same req as we
				 * alloc'd */
    return (retcode);
}
#endif

/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
    struct raid_softc *rs;
    struct dk_softc *dksc;

    rs = raidPtr->softc;
    dksc = &rs->sc_dksc;
    /* quick check to see if anything has died recently */
    rf_lock_mutex2(raidPtr->mutex);
    if (raidPtr->numNewFailures > 0) {
        rf_unlock_mutex2(raidPtr->mutex);
        rf_update_component_labels(raidPtr,
            RF_NORMAL_COMPONENT_UPDATE);
        rf_lock_mutex2(raidPtr->mutex);
        raidPtr->numNewFailures--;
    }
    rf_unlock_mutex2(raidPtr->mutex);

    if ((rs->sc_flags & RAIDF_INITED) == 0) {
        printf("raid%d: raidstart not ready\n", raidPtr->raidid);
        return;
    }

    dk_start(dksc, NULL);
}

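/*
 * Validate a request against the size of the set and hand it to
 * rf_DoAccess() as non-blocking I/O.  Returns EAGAIN when no
 * openings are available so the caller can retry later.
 */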
1891 static int
1892 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
1893 {
1894 RF_SectorCount_t num_blocks, pb, sum;
1895 RF_RaidAddr_t raid_addr;
1896 daddr_t blocknum;
1897 int do_async;
1898 int rc;
1899
1900 rf_lock_mutex2(raidPtr->mutex);
1901 if (raidPtr->openings == 0) {
1902 rf_unlock_mutex2(raidPtr->mutex);
1903 return EAGAIN;
1904 }
1905 rf_unlock_mutex2(raidPtr->mutex);
1906
1907 blocknum = bp->b_rawblkno;
1908
1909 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1910 (int) blocknum));
1911
1912 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1913 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1914
1915 /* *THIS* is where we adjust what block we're going to...
1916 * but DO NOT TOUCH bp->b_blkno!!! */
1917 raid_addr = blocknum;
1918
1919 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1920 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1921 sum = raid_addr + num_blocks + pb;
1922 	if (rf_debugKernelAccess) {
1923 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1924 (int) raid_addr, (int) sum, (int) num_blocks,
1925 (int) pb, (int) bp->b_resid));
1926 }
1927 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1928 || (sum < num_blocks) || (sum < pb)) {
1929 rc = ENOSPC;
1930 goto done;
1931 }
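	/*
	 * Worked example of the check above (assuming 512-byte sectors,
	 * i.e. logBytesPerSector == 9 and sectorMask == 511): a 64KB
	 * request (b_bcount == 65536) starting at raid_addr 1000 gives
	 * num_blocks == 128, pb == 0 and sum == 1128.  The request is
	 * rejected if 1128 lands beyond totalSectors, or if any of the
	 * unsigned additions above wrapped around (sum < raid_addr, etc.).
	 */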
1932 /*
1933 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1934 */
1935
1936 if (bp->b_bcount & raidPtr->sectorMask) {
1937 rc = ENOSPC;
1938 goto done;
1939 }
1940 db1_printf(("Calling DoAccess..\n"));
1941
1942
1943 rf_lock_mutex2(raidPtr->mutex);
1944 raidPtr->openings--;
1945 rf_unlock_mutex2(raidPtr->mutex);
1946
1947 /*
1948 * Everything is async.
1949 */
1950 do_async = 1;
1951
1952 /* don't ever condition on bp->b_flags & B_WRITE.
1953 * always condition on B_READ instead */
1954
1955 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
1956 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
1957 do_async, raid_addr, num_blocks,
1958 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
1959
1960 done:
1961 return rc;
1962 }
1963
1964 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
1965
1966 int
1967 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
1968 {
1969 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
1970 struct buf *bp;
1971
1972 req->queue = queue;
1973 bp = req->bp;
1974
1975 switch (req->type) {
1976 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
1977 /* XXX need to do something extra here.. */
1978 /* I'm leaving this in, as I've never actually seen it used,
1979 * and I'd like folks to report it... GO */
1980 		printf("WAKEUP CALLED\n");
1981 queue->numOutstanding++;
1982
1983 bp->b_flags = 0;
1984 bp->b_private = req;
1985
1986 KernelWakeupFunc(bp);
1987 break;
1988
1989 case RF_IO_TYPE_READ:
1990 case RF_IO_TYPE_WRITE:
1991 #if RF_ACC_TRACE > 0
1992 if (req->tracerec) {
1993 RF_ETIMER_START(req->tracerec->timer);
1994 }
1995 #endif
1996 InitBP(bp, queue->rf_cinfo->ci_vp,
1997 op, queue->rf_cinfo->ci_dev,
1998 req->sectorOffset, req->numSector,
1999 req->buf, KernelWakeupFunc, (void *) req,
2000 queue->raidPtr->logBytesPerSector, req->b_proc);
2001
2002 if (rf_debugKernelAccess) {
2003 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2004 (long) bp->b_blkno));
2005 }
2006 queue->numOutstanding++;
2007 queue->last_deq_sector = req->sectorOffset;
2008 /* acc wouldn't have been let in if there were any pending
2009 * reqs at any other priority */
2010 queue->curPriority = req->priority;
2011
2012 db1_printf(("Going for %c to unit %d col %d\n",
2013 req->type, queue->raidPtr->raidid,
2014 queue->col));
2015 db1_printf(("sector %d count %d (%d bytes) %d\n",
2016 (int) req->sectorOffset, (int) req->numSector,
2017 (int) (req->numSector <<
2018 queue->raidPtr->logBytesPerSector),
2019 (int) queue->raidPtr->logBytesPerSector));
2020
2021 /*
2022 * XXX: drop lock here since this can block at
2023 * least with backing SCSI devices. Retake it
2024 * to minimize fuss with calling interfaces.
2025 */
2026
2027 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2028 bdev_strategy(bp);
2029 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2030 break;
2031
2032 default:
2033 panic("bad req->type in rf_DispatchKernelIO");
2034 }
2035 db1_printf(("Exiting from DispatchKernelIO\n"));
2036
2037 return (0);
2038 }
2039 /* this is the callback function associated with an I/O invoked from
2040  * kernel code.
2041  */
2042 static void
2043 KernelWakeupFunc(struct buf *bp)
2044 {
2045 RF_DiskQueueData_t *req = NULL;
2046 RF_DiskQueue_t *queue;
2047
2048 db1_printf(("recovering the request queue:\n"));
2049
2050 req = bp->b_private;
2051
2052 queue = (RF_DiskQueue_t *) req->queue;
2053
2054 rf_lock_mutex2(queue->raidPtr->iodone_lock);
2055
2056 #if RF_ACC_TRACE > 0
2057 if (req->tracerec) {
2058 RF_ETIMER_STOP(req->tracerec->timer);
2059 RF_ETIMER_EVAL(req->tracerec->timer);
2060 rf_lock_mutex2(rf_tracing_mutex);
2061 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2062 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2063 req->tracerec->num_phys_ios++;
2064 rf_unlock_mutex2(rf_tracing_mutex);
2065 }
2066 #endif
2067
2068 /* XXX Ok, let's get aggressive... If b_error is set, let's go
2069 * ballistic, and mark the component as hosed... */
2070
2071 if (bp->b_error != 0) {
2072 /* Mark the disk as dead */
2073 /* but only mark it once... */
2074 /* and only if it wouldn't leave this RAID set
2075 completely broken */
2076 if (((queue->raidPtr->Disks[queue->col].status ==
2077 rf_ds_optimal) ||
2078 (queue->raidPtr->Disks[queue->col].status ==
2079 rf_ds_used_spare)) &&
2080 (queue->raidPtr->numFailures <
2081 queue->raidPtr->Layout.map->faultsTolerated)) {
2082 printf("raid%d: IO Error (%d). Marking %s as failed.\n",
2083 queue->raidPtr->raidid,
2084 bp->b_error,
2085 queue->raidPtr->Disks[queue->col].devname);
2086 queue->raidPtr->Disks[queue->col].status =
2087 rf_ds_failed;
2088 queue->raidPtr->status = rf_rs_degraded;
2089 queue->raidPtr->numFailures++;
2090 queue->raidPtr->numNewFailures++;
2091 } else { /* Disk is already dead... */
2092 /* printf("Disk already marked as dead!\n"); */
2093 }
2094
2095 }
2096
2097 /* Fill in the error value */
2098 req->error = bp->b_error;
2099
2100 /* Drop this one on the "finished" queue... */
2101 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2102
2103 /* Let the raidio thread know there is work to be done. */
2104 rf_signal_cond2(queue->raidPtr->iodone_cv);
2105
2106 rf_unlock_mutex2(queue->raidPtr->iodone_lock);
2107 }
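/*
 * A minimal sketch (illustration only, never compiled) of how the
 * raidio thread could drain the "finished" queue that
 * KernelWakeupFunc() fills in; the real consumer lives elsewhere in
 * RAIDframe and does more per-request work:
 */
#if 0
static void
raidio_consumer_sketch(RF_Raid_t *raidPtr)
{
	RF_DiskQueueData_t *req;

	rf_lock_mutex2(raidPtr->iodone_lock);
	while (TAILQ_EMPTY(&raidPtr->iodone))
		rf_wait_cond2(raidPtr->iodone_cv, raidPtr->iodone_lock);
	req = TAILQ_FIRST(&raidPtr->iodone);
	TAILQ_REMOVE(&raidPtr->iodone, req, iodone_entries);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	/* ... complete the I/O described by req ... */
}
#endif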
2108
2109
2110 /*
2111 * initialize a buf structure for doing an I/O in the kernel.
2112 */
2113 static void
2114 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2115 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2116 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2117 struct proc *b_proc)
2118 {
2119 /* bp->b_flags = B_PHYS | rw_flag; */
2120 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2121 bp->b_oflags = 0;
2122 bp->b_cflags = 0;
2123 bp->b_bcount = numSect << logBytesPerSector;
2124 bp->b_bufsize = bp->b_bcount;
2125 bp->b_error = 0;
2126 bp->b_dev = dev;
2127 bp->b_data = bf;
2128 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
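	/*
	 * E.g. with 512-byte sectors (logBytesPerSector == 9) and the
	 * usual DEV_BSHIFT of 9, the conversion above reduces to
	 * b_blkno = startSect; with 4096-byte sectors it becomes
	 * b_blkno = startSect * 8.
	 */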
2129 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2130 if (bp->b_bcount == 0) {
2131 panic("bp->b_bcount is zero in InitBP!!");
2132 }
2133 bp->b_proc = b_proc;
2134 bp->b_iodone = cbFunc;
2135 bp->b_private = cbArg;
2136 }
2137
2138 /*
2139 * Wait interruptibly for an exclusive lock.
2140 *
2141 * XXX
2142 * Several drivers do this; it should be abstracted and made MP-safe.
2143 * (Hmm... where have we seen this warning before :-> GO )
2144 */
2145 static int
2146 raidlock(struct raid_softc *rs)
2147 {
2148 int error;
2149
2150 error = 0;
2151 mutex_enter(&rs->sc_mutex);
2152 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2153 rs->sc_flags |= RAIDF_WANTED;
2154 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2155 if (error != 0)
2156 goto done;
2157 }
2158 rs->sc_flags |= RAIDF_LOCKED;
2159 done:
2160 mutex_exit(&rs->sc_mutex);
2161 return (error);
2162 }
2163 /*
2164 * Unlock and wake up any waiters.
2165 */
2166 static void
2167 raidunlock(struct raid_softc *rs)
2168 {
2169
2170 mutex_enter(&rs->sc_mutex);
2171 rs->sc_flags &= ~RAIDF_LOCKED;
2172 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2173 rs->sc_flags &= ~RAIDF_WANTED;
2174 cv_broadcast(&rs->sc_cv);
2175 }
2176 mutex_exit(&rs->sc_mutex);
2177 }
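/*
 * Typical usage of the raidlock()/raidunlock() pair above, as seen in
 * raid_detach() later in this file (sketch only, never compiled):
 */
#if 0
	if ((error = raidlock(rs)) != 0)
		return error;
	/* ... operate on the softc while holding the exclusive lock ... */
	raidunlock(rs);
#endif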
2178
2179
2180 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2181 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2182 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2183
2184 static daddr_t
2185 rf_component_info_offset(void)
2186 {
2187
2188 return RF_COMPONENT_INFO_OFFSET;
2189 }
2190
2191 static daddr_t
2192 rf_component_info_size(unsigned secsize)
2193 {
2194 daddr_t info_size;
2195
2196 KASSERT(secsize);
2197 if (secsize > RF_COMPONENT_INFO_SIZE)
2198 info_size = secsize;
2199 else
2200 info_size = RF_COMPONENT_INFO_SIZE;
2201
2202 return info_size;
2203 }
2204
2205 static daddr_t
2206 rf_parity_map_offset(RF_Raid_t *raidPtr)
2207 {
2208 daddr_t map_offset;
2209
2210 KASSERT(raidPtr->bytesPerSector);
2211 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2212 map_offset = raidPtr->bytesPerSector;
2213 else
2214 map_offset = RF_COMPONENT_INFO_SIZE;
2215 map_offset += rf_component_info_offset();
2216
2217 return map_offset;
2218 }
2219
2220 static daddr_t
2221 rf_parity_map_size(RF_Raid_t *raidPtr)
2222 {
2223 daddr_t map_size;
2224
2225 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2226 map_size = raidPtr->bytesPerSector;
2227 else
2228 map_size = RF_PARITY_MAP_SIZE;
2229
2230 return map_size;
2231 }
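/*
 * Worked example of the on-disk layout computed above: with 512-byte
 * sectors the component label lives at byte offset 16384 in an area of
 * max(512, 1024) == 1024 bytes, so the parity map starts at byte
 * 17408.  With 4096-byte sectors the label area grows to 4096 bytes
 * and the parity map starts at byte 20480.
 */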
2232
2233 int
2234 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2235 {
2236 RF_ComponentLabel_t *clabel;
2237
2238 clabel = raidget_component_label(raidPtr, col);
2239 clabel->clean = RF_RAID_CLEAN;
2240 raidflush_component_label(raidPtr, col);
2241 return(0);
2242 }
2243
2244
2245 int
2246 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2247 {
2248 RF_ComponentLabel_t *clabel;
2249
2250 clabel = raidget_component_label(raidPtr, col);
2251 clabel->clean = RF_RAID_DIRTY;
2252 raidflush_component_label(raidPtr, col);
2253 return(0);
2254 }
2255
2256 int
2257 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2258 {
2259 KASSERT(raidPtr->bytesPerSector);
2260 return raidread_component_label(raidPtr->bytesPerSector,
2261 raidPtr->Disks[col].dev,
2262 raidPtr->raid_cinfo[col].ci_vp,
2263 &raidPtr->raid_cinfo[col].ci_label);
2264 }
2265
2266 RF_ComponentLabel_t *
2267 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2268 {
2269 return &raidPtr->raid_cinfo[col].ci_label;
2270 }
2271
2272 int
2273 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2274 {
2275 RF_ComponentLabel_t *label;
2276
2277 label = &raidPtr->raid_cinfo[col].ci_label;
2278 label->mod_counter = raidPtr->mod_counter;
2279 #ifndef RF_NO_PARITY_MAP
2280 label->parity_map_modcount = label->mod_counter;
2281 #endif
2282 return raidwrite_component_label(raidPtr->bytesPerSector,
2283 raidPtr->Disks[col].dev,
2284 raidPtr->raid_cinfo[col].ci_vp, label);
2285 }
2286
2287
2288 static int
2289 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2290 RF_ComponentLabel_t *clabel)
2291 {
2292 return raidread_component_area(dev, b_vp, clabel,
2293 sizeof(RF_ComponentLabel_t),
2294 rf_component_info_offset(),
2295 rf_component_info_size(secsize));
2296 }
2297
2298 /* ARGSUSED */
2299 static int
2300 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2301 size_t msize, daddr_t offset, daddr_t dsize)
2302 {
2303 struct buf *bp;
2304 int error;
2305
2306 /* XXX should probably ensure that we don't try to do this if
2307 someone has changed rf_protected_sectors. */
2308
2309 if (b_vp == NULL) {
2310 /* For whatever reason, this component is not valid.
2311 Don't try to read a component label from it. */
2312 return(EINVAL);
2313 }
2314
2315 /* get a block of the appropriate size... */
2316 bp = geteblk((int)dsize);
2317 bp->b_dev = dev;
2318
2319 /* get our ducks in a row for the read */
2320 bp->b_blkno = offset / DEV_BSIZE;
2321 bp->b_bcount = dsize;
2322 bp->b_flags |= B_READ;
2323 bp->b_resid = dsize;
2324
2325 bdev_strategy(bp);
2326 error = bp->b_error;
2327 if (!error)
2328 error = biowait(bp);
2329
2330 if (!error) {
2331 memcpy(data, bp->b_data, msize);
2332 }
2333
2334 brelse(bp, 0);
2335 return(error);
2336 }
2337
2338
2339 static int
2340 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2341 RF_ComponentLabel_t *clabel)
2342 {
2343 return raidwrite_component_area(dev, b_vp, clabel,
2344 sizeof(RF_ComponentLabel_t),
2345 rf_component_info_offset(),
2346 rf_component_info_size(secsize), 0);
2347 }
2348
2349 /* ARGSUSED */
2350 static int
2351 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2352 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2353 {
2354 struct buf *bp;
2355 int error;
2356
2357 /* get a block of the appropriate size... */
2358 bp = geteblk((int)dsize);
2359 bp->b_dev = dev;
2360
2361 /* get our ducks in a row for the write */
2362 bp->b_blkno = offset / DEV_BSIZE;
2363 bp->b_bcount = dsize;
2364 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2365 bp->b_resid = dsize;
2366
2367 memset(bp->b_data, 0, dsize);
2368 memcpy(bp->b_data, data, msize);
2369
2370 bdev_strategy(bp);
2371 if (asyncp)
2372 return 0;
2373 error = bp->b_error;
2374 if (!error)
2375 error = biowait(bp);
2376 brelse(bp, 0);
2377 	if (error)
2378 		printf("Failed to write RAID component info!\n");
2382
2383 return(error);
2384 }
2385
2386 void
2387 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2388 {
2389 int c;
2390
2391 for (c = 0; c < raidPtr->numCol; c++) {
2392 /* Skip dead disks. */
2393 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2394 continue;
2395 /* XXXjld: what if an error occurs here? */
2396 raidwrite_component_area(raidPtr->Disks[c].dev,
2397 raidPtr->raid_cinfo[c].ci_vp, map,
2398 RF_PARITYMAP_NBYTE,
2399 rf_parity_map_offset(raidPtr),
2400 rf_parity_map_size(raidPtr), 0);
2401 }
2402 }
2403
2404 void
2405 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2406 {
2407 struct rf_paritymap_ondisk tmp;
2408 int c,first;
2409
2410 first=1;
2411 for (c = 0; c < raidPtr->numCol; c++) {
2412 /* Skip dead disks. */
2413 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2414 continue;
2415 raidread_component_area(raidPtr->Disks[c].dev,
2416 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2417 RF_PARITYMAP_NBYTE,
2418 rf_parity_map_offset(raidPtr),
2419 rf_parity_map_size(raidPtr));
2420 if (first) {
2421 memcpy(map, &tmp, sizeof(*map));
2422 first = 0;
2423 } else {
2424 rf_paritymap_merge(map, &tmp);
2425 }
2426 }
2427 }
2428
2429 void
2430 rf_markalldirty(RF_Raid_t *raidPtr)
2431 {
2432 RF_ComponentLabel_t *clabel;
2433 int sparecol;
2434 int c;
2435 int j;
2436 int scol = -1;
2437
2438 raidPtr->mod_counter++;
2439 for (c = 0; c < raidPtr->numCol; c++) {
2440 /* we don't want to touch (at all) a disk that has
2441 failed */
2442 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2443 clabel = raidget_component_label(raidPtr, c);
2444 if (clabel->status == rf_ds_spared) {
2445 /* XXX do something special...
2446 but whatever you do, don't
2447 try to access it!! */
2448 } else {
2449 raidmarkdirty(raidPtr, c);
2450 }
2451 }
2452 }
2453
2454 for( c = 0; c < raidPtr->numSpare ; c++) {
2455 sparecol = raidPtr->numCol + c;
2456 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2457 			/*
2458 			 * We claim this disk is "optimal" if it's
2459 			 * rf_ds_used_spare, as that means it should be
2460 			 * directly substitutable for the disk it replaced.
2461 			 * We note that too...
2462 			 */
2465
2466 for(j=0;j<raidPtr->numCol;j++) {
2467 if (raidPtr->Disks[j].spareCol == sparecol) {
2468 scol = j;
2469 break;
2470 }
2471 }
2472
2473 clabel = raidget_component_label(raidPtr, sparecol);
2474 /* make sure status is noted */
2475
2476 raid_init_component_label(raidPtr, clabel);
2477
2478 clabel->row = 0;
2479 clabel->column = scol;
2480 /* Note: we *don't* change status from rf_ds_used_spare
2481 to rf_ds_optimal */
2482 /* clabel.status = rf_ds_optimal; */
2483
2484 raidmarkdirty(raidPtr, sparecol);
2485 }
2486 }
2487 }
2488
2489
2490 void
2491 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2492 {
2493 RF_ComponentLabel_t *clabel;
2494 int sparecol;
2495 int c;
2496 int j;
2497 int scol;
2498
2499 scol = -1;
2500
2501 /* XXX should do extra checks to make sure things really are clean,
2502 rather than blindly setting the clean bit... */
2503
2504 raidPtr->mod_counter++;
2505
2506 for (c = 0; c < raidPtr->numCol; c++) {
2507 if (raidPtr->Disks[c].status == rf_ds_optimal) {
2508 clabel = raidget_component_label(raidPtr, c);
2509 /* make sure status is noted */
2510 clabel->status = rf_ds_optimal;
2511
2512 /* note what unit we are configured as */
2513 clabel->last_unit = raidPtr->raidid;
2514
2515 raidflush_component_label(raidPtr, c);
2516 if (final == RF_FINAL_COMPONENT_UPDATE) {
2517 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2518 raidmarkclean(raidPtr, c);
2519 }
2520 }
2521 }
2522 /* else we don't touch it.. */
2523 }
2524
2525 for( c = 0; c < raidPtr->numSpare ; c++) {
2526 sparecol = raidPtr->numCol + c;
2527 /* Need to ensure that the reconstruct actually completed! */
2528 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2529 			/*
2530 			 * We claim this disk is "optimal" if it's
2531 			 * rf_ds_used_spare, as that means it should be
2532 			 * directly substitutable for the disk it replaced.
2533 			 * We note that too...
2534 			 */
2537
2538 for(j=0;j<raidPtr->numCol;j++) {
2539 if (raidPtr->Disks[j].spareCol == sparecol) {
2540 scol = j;
2541 break;
2542 }
2543 }
2544
2545 /* XXX shouldn't *really* need this... */
2546 clabel = raidget_component_label(raidPtr, sparecol);
2547 /* make sure status is noted */
2548
2549 raid_init_component_label(raidPtr, clabel);
2550
2551 clabel->column = scol;
2552 clabel->status = rf_ds_optimal;
2553 clabel->last_unit = raidPtr->raidid;
2554
2555 raidflush_component_label(raidPtr, sparecol);
2556 if (final == RF_FINAL_COMPONENT_UPDATE) {
2557 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2558 raidmarkclean(raidPtr, sparecol);
2559 }
2560 }
2561 }
2562 }
2563 }
2564
2565 void
2566 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2567 {
2568
2569 if (vp != NULL) {
2570 if (auto_configured == 1) {
2571 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2572 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2573 vput(vp);
2574
2575 } else {
2576 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2577 }
2578 }
2579 }
2580
2581
2582 void
2583 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2584 {
2585 int r,c;
2586 struct vnode *vp;
2587 int acd;
2588
2589
2590 /* We take this opportunity to close the vnodes like we should.. */
2591
2592 for (c = 0; c < raidPtr->numCol; c++) {
2593 vp = raidPtr->raid_cinfo[c].ci_vp;
2594 acd = raidPtr->Disks[c].auto_configured;
2595 rf_close_component(raidPtr, vp, acd);
2596 raidPtr->raid_cinfo[c].ci_vp = NULL;
2597 raidPtr->Disks[c].auto_configured = 0;
2598 }
2599
2600 for (r = 0; r < raidPtr->numSpare; r++) {
2601 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2602 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2603 rf_close_component(raidPtr, vp, acd);
2604 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2605 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2606 }
2607 }
2608
2609
2610 void
2611 rf_ReconThread(struct rf_recon_req *req)
2612 {
2613 int s;
2614 RF_Raid_t *raidPtr;
2615
2616 s = splbio();
2617 raidPtr = (RF_Raid_t *) req->raidPtr;
2618 raidPtr->recon_in_progress = 1;
2619
2620 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2621 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2622
2623 RF_Free(req, sizeof(*req));
2624
2625 raidPtr->recon_in_progress = 0;
2626 splx(s);
2627
2628 /* That's all... */
2629 kthread_exit(0); /* does not return */
2630 }
2631
2632 void
2633 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2634 {
2635 int retcode;
2636 int s;
2637
2638 raidPtr->parity_rewrite_stripes_done = 0;
2639 raidPtr->parity_rewrite_in_progress = 1;
2640 s = splbio();
2641 retcode = rf_RewriteParity(raidPtr);
2642 splx(s);
2643 if (retcode) {
2644 printf("raid%d: Error re-writing parity (%d)!\n",
2645 raidPtr->raidid, retcode);
2646 } else {
2647 /* set the clean bit! If we shutdown correctly,
2648 the clean bit on each component label will get
2649 set */
2650 raidPtr->parity_good = RF_RAID_CLEAN;
2651 }
2652 raidPtr->parity_rewrite_in_progress = 0;
2653
2654 /* Anyone waiting for us to stop? If so, inform them... */
2655 if (raidPtr->waitShutdown) {
2656 wakeup(&raidPtr->parity_rewrite_in_progress);
2657 }
2658
2659 /* That's all... */
2660 kthread_exit(0); /* does not return */
2661 }
2662
2663
2664 void
2665 rf_CopybackThread(RF_Raid_t *raidPtr)
2666 {
2667 int s;
2668
2669 raidPtr->copyback_in_progress = 1;
2670 s = splbio();
2671 rf_CopybackReconstructedData(raidPtr);
2672 splx(s);
2673 raidPtr->copyback_in_progress = 0;
2674
2675 /* That's all... */
2676 kthread_exit(0); /* does not return */
2677 }
2678
2679
2680 void
2681 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2682 {
2683 int s;
2684 RF_Raid_t *raidPtr;
2685
2686 s = splbio();
2687 raidPtr = req->raidPtr;
2688 raidPtr->recon_in_progress = 1;
2689 rf_ReconstructInPlace(raidPtr, req->col);
2690 RF_Free(req, sizeof(*req));
2691 raidPtr->recon_in_progress = 0;
2692 splx(s);
2693
2694 /* That's all... */
2695 kthread_exit(0); /* does not return */
2696 }
2697
2698 static RF_AutoConfig_t *
2699 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2700 const char *cname, RF_SectorCount_t size, uint64_t numsecs,
2701 unsigned secsize)
2702 {
2703 int good_one = 0;
2704 RF_ComponentLabel_t *clabel;
2705 RF_AutoConfig_t *ac;
2706
2707 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
2708 if (clabel == NULL) {
2709 oomem:
2710 while(ac_list) {
2711 ac = ac_list;
2712 if (ac->clabel)
2713 free(ac->clabel, M_RAIDFRAME);
2714 ac_list = ac_list->next;
2715 free(ac, M_RAIDFRAME);
2716 }
2717 printf("RAID auto config: out of memory!\n");
2718 return NULL; /* XXX probably should panic? */
2719 }
2720
2721 if (!raidread_component_label(secsize, dev, vp, clabel)) {
2722 /* Got the label. Does it look reasonable? */
2723 if (rf_reasonable_label(clabel, numsecs) &&
2724 (rf_component_label_partitionsize(clabel) <= size)) {
2725 #ifdef DEBUG
2726 printf("Component on: %s: %llu\n",
2727 cname, (unsigned long long)size);
2728 rf_print_component_label(clabel);
2729 #endif
2730 /* if it's reasonable, add it, else ignore it. */
2731 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2732 M_NOWAIT);
2733 if (ac == NULL) {
2734 free(clabel, M_RAIDFRAME);
2735 goto oomem;
2736 }
2737 strlcpy(ac->devname, cname, sizeof(ac->devname));
2738 ac->dev = dev;
2739 ac->vp = vp;
2740 ac->clabel = clabel;
2741 ac->next = ac_list;
2742 ac_list = ac;
2743 good_one = 1;
2744 }
2745 }
2746 if (!good_one) {
2747 /* cleanup */
2748 free(clabel, M_RAIDFRAME);
2749 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2750 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2751 vput(vp);
2752 }
2753 return ac_list;
2754 }
2755
2756 RF_AutoConfig_t *
2757 rf_find_raid_components(void)
2758 {
2759 struct vnode *vp;
2760 struct disklabel label;
2761 device_t dv;
2762 deviter_t di;
2763 dev_t dev;
2764 int bmajor, bminor, wedge, rf_part_found;
2765 int error;
2766 int i;
2767 RF_AutoConfig_t *ac_list;
2768 uint64_t numsecs;
2769 unsigned secsize;
2770 int dowedges;
2771
2772 /* initialize the AutoConfig list */
2773 ac_list = NULL;
2774
2775 	/*
2776 	 * We begin by trolling through *all* the devices on the system,
2777 	 * *twice*: first we scan for wedges, then for other devices.  This
2778 	 * avoids using a raw partition instead of a wedge covering the whole disk.
2779 	 */
2780
2781 for (dowedges=1; dowedges>=0; --dowedges) {
2782 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
2783 dv = deviter_next(&di)) {
2784
2785 /* we are only interested in disks... */
2786 if (device_class(dv) != DV_DISK)
2787 continue;
2788
2789 /* we don't care about floppies... */
2790 if (device_is_a(dv, "fd")) {
2791 continue;
2792 }
2793
2794 /* we don't care about CD's... */
2795 if (device_is_a(dv, "cd")) {
2796 continue;
2797 }
2798
2799 /* we don't care about md's... */
2800 if (device_is_a(dv, "md")) {
2801 continue;
2802 }
2803
2804 /* hdfd is the Atari/Hades floppy driver */
2805 if (device_is_a(dv, "hdfd")) {
2806 continue;
2807 }
2808
2809 /* fdisa is the Atari/Milan floppy driver */
2810 if (device_is_a(dv, "fdisa")) {
2811 continue;
2812 }
2813
2814 /* are we in the wedges pass ? */
2815 wedge = device_is_a(dv, "dk");
2816 if (wedge != dowedges) {
2817 continue;
2818 }
2819
2820 /* need to find the device_name_to_block_device_major stuff */
2821 bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
2822
2823 rf_part_found = 0; /*No raid partition as yet*/
2824
2825 /* get a vnode for the raw partition of this disk */
2826 bminor = minor(device_unit(dv));
2827 dev = wedge ? makedev(bmajor, bminor) :
2828 MAKEDISKDEV(bmajor, bminor, RAW_PART);
2829 if (bdevvp(dev, &vp))
2830 panic("RAID can't alloc vnode");
2831
2832 error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
2833
2834 if (error) {
2835 				/* "Who cares." Continue looking
2836 				   for something that exists */
2837 vput(vp);
2838 continue;
2839 }
2840
2841 error = getdisksize(vp, &numsecs, &secsize);
2842 if (error) {
2843 /*
2844 * Pseudo devices like vnd and cgd can be
2845 * opened but may still need some configuration.
2846 * Ignore these quietly.
2847 */
2848 if (error != ENXIO)
2849 printf("RAIDframe: can't get disk size"
2850 " for dev %s (%d)\n",
2851 device_xname(dv), error);
2852 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2853 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2854 vput(vp);
2855 continue;
2856 }
2857 if (wedge) {
2858 struct dkwedge_info dkw;
2859 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
2860 NOCRED);
2861 if (error) {
2862 printf("RAIDframe: can't get wedge info for "
2863 "dev %s (%d)\n", device_xname(dv), error);
2864 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2865 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2866 vput(vp);
2867 continue;
2868 }
2869
2870 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
2871 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2872 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2873 vput(vp);
2874 continue;
2875 }
2876
2877 ac_list = rf_get_component(ac_list, dev, vp,
2878 device_xname(dv), dkw.dkw_size, numsecs, secsize);
2879 rf_part_found = 1; /*There is a raid component on this disk*/
2880 continue;
2881 }
2882
2883 /* Ok, the disk exists. Go get the disklabel. */
2884 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
2885 if (error) {
2886 /*
2887 * XXX can't happen - open() would
2888 * have errored out (or faked up one)
2889 */
2890 if (error != ENOTTY)
2891 printf("RAIDframe: can't get label for dev "
2892 "%s (%d)\n", device_xname(dv), error);
2893 }
2894
2895 /* don't need this any more. We'll allocate it again
2896 a little later if we really do... */
2897 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2898 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2899 vput(vp);
2900
2901 if (error)
2902 continue;
2903
2904 rf_part_found = 0; /*No raid partitions yet*/
2905 for (i = 0; i < label.d_npartitions; i++) {
2906 char cname[sizeof(ac_list->devname)];
2907
2908 /* We only support partitions marked as RAID */
2909 if (label.d_partitions[i].p_fstype != FS_RAID)
2910 continue;
2911
2912 dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
2913 if (bdevvp(dev, &vp))
2914 panic("RAID can't alloc vnode");
2915
2916 error = VOP_OPEN(vp, FREAD, NOCRED);
2917 if (error) {
2918 /* Whatever... */
2919 vput(vp);
2920 continue;
2921 }
2922 snprintf(cname, sizeof(cname), "%s%c",
2923 device_xname(dv), 'a' + i);
2924 ac_list = rf_get_component(ac_list, dev, vp, cname,
2925 label.d_partitions[i].p_size, numsecs, secsize);
2926 rf_part_found = 1; /*There is at least one raid partition on this disk*/
2927 }
2928
2929 			/*
2930 			 * If there is no raid component on this disk, either in a
2931 			 * disklabel or inside a wedge, check the raw partition as
2932 			 * well, as it is possible to configure raid components on
2933 			 * raw disk devices.
2934 			 */
2935
2936 if (!rf_part_found) {
2937 char cname[sizeof(ac_list->devname)];
2938
2939 dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
2940 if (bdevvp(dev, &vp))
2941 panic("RAID can't alloc vnode");
2942
2943 error = VOP_OPEN(vp, FREAD, NOCRED);
2944 if (error) {
2945 /* Whatever... */
2946 vput(vp);
2947 continue;
2948 }
2949 snprintf(cname, sizeof(cname), "%s%c",
2950 device_xname(dv), 'a' + RAW_PART);
2951 ac_list = rf_get_component(ac_list, dev, vp, cname,
2952 label.d_partitions[RAW_PART].p_size, numsecs, secsize);
2953 }
2954 }
2955 deviter_release(&di);
2956 }
2957 return ac_list;
2958 }
2959
2960
2961 int
2962 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
2963 {
2964
2965 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2966 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2967 ((clabel->clean == RF_RAID_CLEAN) ||
2968 (clabel->clean == RF_RAID_DIRTY)) &&
2969 clabel->row >=0 &&
2970 clabel->column >= 0 &&
2971 clabel->num_rows > 0 &&
2972 clabel->num_columns > 0 &&
2973 clabel->row < clabel->num_rows &&
2974 clabel->column < clabel->num_columns &&
2975 clabel->blockSize > 0 &&
2976 /*
2977 * numBlocksHi may contain garbage, but it is ok since
2978 * the type is unsigned. If it is really garbage,
2979 * rf_fix_old_label_size() will fix it.
2980 */
2981 rf_component_label_numblocks(clabel) > 0) {
2982 /*
2983 * label looks reasonable enough...
2984 * let's make sure it has no old garbage.
2985 */
2986 if (numsecs)
2987 rf_fix_old_label_size(clabel, numsecs);
2988 return(1);
2989 }
2990 return(0);
2991 }
2992
2993
2994 /*
2995 * For reasons yet unknown, some old component labels have garbage in
2996 * the newer numBlocksHi region, and this causes lossage. Since those
2997 * disks will also have numsecs set to less than 32 bits of sectors,
2998 * we can determine when this corruption has occurred, and fix it.
2999 *
3000 * The exact same problem, with the same unknown reason, happens to
3001 * the partitionSizeHi member as well.
3002 */
3003 static void
3004 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3005 {
3006
3007 if (numsecs < ((uint64_t)1 << 32)) {
3008 if (clabel->numBlocksHi) {
3009 printf("WARNING: total sectors < 32 bits, yet "
3010 "numBlocksHi set\n"
3011 "WARNING: resetting numBlocksHi to zero.\n");
3012 clabel->numBlocksHi = 0;
3013 }
3014
3015 if (clabel->partitionSizeHi) {
3016 printf("WARNING: total sectors < 32 bits, yet "
3017 "partitionSizeHi set\n"
3018 "WARNING: resetting partitionSizeHi to zero.\n");
3019 clabel->partitionSizeHi = 0;
3020 }
3021 }
3022 }
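/*
 * Example of the repair above: a component on a disk of 2^28 sectors
 * whose label somehow has numBlocksHi == 7 would claim more than 2^32
 * blocks, which that disk cannot hold; since numsecs < 2^32 we know
 * the high word is garbage and clear it (likewise partitionSizeHi).
 */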
3023
3024
3025 #ifdef DEBUG
3026 void
3027 rf_print_component_label(RF_ComponentLabel_t *clabel)
3028 {
3029 uint64_t numBlocks;
3030 static const char *rp[] = {
3031 "No", "Force", "Soft", "*invalid*"
3032 };
3033
3034
3035 numBlocks = rf_component_label_numblocks(clabel);
3036
3037 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
3038 clabel->row, clabel->column,
3039 clabel->num_rows, clabel->num_columns);
3040 printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
3041 clabel->version, clabel->serial_number,
3042 clabel->mod_counter);
3043 printf(" Clean: %s Status: %d\n",
3044 clabel->clean ? "Yes" : "No", clabel->status);
3045 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
3046 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
3047 printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n",
3048 (char) clabel->parityConfig, clabel->blockSize, numBlocks);
3049 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
3050 printf(" Root partition: %s\n", rp[clabel->root_partition & 3]);
3051 printf(" Last configured as: raid%d\n", clabel->last_unit);
3052 #if 0
3053 printf(" Config order: %d\n", clabel->config_order);
3054 #endif
3055
3056 }
3057 #endif
3058
3059 RF_ConfigSet_t *
3060 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3061 {
3062 RF_AutoConfig_t *ac;
3063 RF_ConfigSet_t *config_sets;
3064 RF_ConfigSet_t *cset;
3065 RF_AutoConfig_t *ac_next;
3066
3067
3068 config_sets = NULL;
3069
3070 /* Go through the AutoConfig list, and figure out which components
3071 belong to what sets. */
3072 ac = ac_list;
3073 while(ac!=NULL) {
3074 /* we're going to putz with ac->next, so save it here
3075 for use at the end of the loop */
3076 ac_next = ac->next;
3077
3078 if (config_sets == NULL) {
3079 /* will need at least this one... */
3080 config_sets = (RF_ConfigSet_t *)
3081 malloc(sizeof(RF_ConfigSet_t),
3082 M_RAIDFRAME, M_NOWAIT);
3083 if (config_sets == NULL) {
3084 panic("rf_create_auto_sets: No memory!");
3085 }
3086 /* this one is easy :) */
3087 config_sets->ac = ac;
3088 config_sets->next = NULL;
3089 config_sets->rootable = 0;
3090 ac->next = NULL;
3091 } else {
3092 /* which set does this component fit into? */
3093 cset = config_sets;
3094 while(cset!=NULL) {
3095 if (rf_does_it_fit(cset, ac)) {
3096 /* looks like it matches... */
3097 ac->next = cset->ac;
3098 cset->ac = ac;
3099 break;
3100 }
3101 cset = cset->next;
3102 }
3103 if (cset==NULL) {
3104 /* didn't find a match above... new set..*/
3105 cset = (RF_ConfigSet_t *)
3106 malloc(sizeof(RF_ConfigSet_t),
3107 M_RAIDFRAME, M_NOWAIT);
3108 if (cset == NULL) {
3109 panic("rf_create_auto_sets: No memory!");
3110 }
3111 cset->ac = ac;
3112 ac->next = NULL;
3113 cset->next = config_sets;
3114 cset->rootable = 0;
3115 config_sets = cset;
3116 }
3117 }
3118 ac = ac_next;
3119 }
3120
3121
3122 return(config_sets);
3123 }
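/*
 * Illustrative example (hypothetical device names): if the scan found
 * wd0a and wd1a carrying serial number 100 and wd2a carrying serial
 * number 200, the loop above produces two config sets, one holding
 * the two serial-100 components and one holding wd2a, since
 * rf_does_it_fit() below keys on the label contents (serial number,
 * geometry, and so on).
 */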
3124
3125 static int
3126 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3127 {
3128 RF_ComponentLabel_t *clabel1, *clabel2;
3129
3130 /* If this one matches the *first* one in the set, that's good
3131 enough, since the other members of the set would have been
3132 through here too... */
3133 /* note that we are not checking partitionSize here..
3134
3135 Note that we are also not checking the mod_counters here.
3136 If everything else matches except the mod_counter, that's
3137 good enough for this test. We will deal with the mod_counters
3138 a little later in the autoconfiguration process.
3139
3140 (clabel1->mod_counter == clabel2->mod_counter) &&
3141
3142 The reason we don't check for this is that failed disks
3143 will have lower modification counts. If those disks are
3144 not added to the set they used to belong to, then they will
3145 form their own set, which may result in 2 different sets,
3146 for example, competing to be configured at raid0, and
3147 perhaps competing to be the root filesystem set. If the
3148 wrong ones get configured, or both attempt to become /,
3149 	  weird behaviour and/or serious lossage will occur. Thus we
3150 need to bring them into the fold here, and kick them out at
3151 a later point.
3152
3153 */
3154
3155 clabel1 = cset->ac->clabel;
3156 clabel2 = ac->clabel;
3157 if ((clabel1->version == clabel2->version) &&
3158 (clabel1->serial_number == clabel2->serial_number) &&
3159 (clabel1->num_rows == clabel2->num_rows) &&
3160 (clabel1->num_columns == clabel2->num_columns) &&
3161 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3162 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3163 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3164 (clabel1->parityConfig == clabel2->parityConfig) &&
3165 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3166 (clabel1->blockSize == clabel2->blockSize) &&
3167 rf_component_label_numblocks(clabel1) ==
3168 rf_component_label_numblocks(clabel2) &&
3169 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3170 (clabel1->root_partition == clabel2->root_partition) &&
3171 (clabel1->last_unit == clabel2->last_unit) &&
3172 (clabel1->config_order == clabel2->config_order)) {
3173 		/* if it gets here, it almost *has* to be a match */
3174 } else {
3175 /* it's not consistent with somebody in the set..
3176 punt */
3177 return(0);
3178 }
3179 /* all was fine.. it must fit... */
3180 return(1);
3181 }
3182
3183 int
3184 rf_have_enough_components(RF_ConfigSet_t *cset)
3185 {
3186 RF_AutoConfig_t *ac;
3187 RF_AutoConfig_t *auto_config;
3188 RF_ComponentLabel_t *clabel;
3189 int c;
3190 int num_cols;
3191 int num_missing;
3192 int mod_counter;
3193 int mod_counter_found;
3194 int even_pair_failed;
3195 char parity_type;
3196
3197
3198 /* check to see that we have enough 'live' components
3199 of this set. If so, we can configure it if necessary */
3200
3201 num_cols = cset->ac->clabel->num_columns;
3202 parity_type = cset->ac->clabel->parityConfig;
3203
3204 /* XXX Check for duplicate components!?!?!? */
3205
3206 /* Determine what the mod_counter is supposed to be for this set. */
3207
3208 mod_counter_found = 0;
3209 mod_counter = 0;
3210 ac = cset->ac;
3211 while(ac!=NULL) {
3212 if (mod_counter_found==0) {
3213 mod_counter = ac->clabel->mod_counter;
3214 mod_counter_found = 1;
3215 } else {
3216 if (ac->clabel->mod_counter > mod_counter) {
3217 mod_counter = ac->clabel->mod_counter;
3218 }
3219 }
3220 ac = ac->next;
3221 }
3222
3223 num_missing = 0;
3224 auto_config = cset->ac;
3225
3226 even_pair_failed = 0;
3227 for(c=0; c<num_cols; c++) {
3228 ac = auto_config;
3229 while(ac!=NULL) {
3230 if ((ac->clabel->column == c) &&
3231 (ac->clabel->mod_counter == mod_counter)) {
3232 /* it's this one... */
3233 #ifdef DEBUG
3234 printf("Found: %s at %d\n",
3235 ac->devname,c);
3236 #endif
3237 break;
3238 }
3239 ac=ac->next;
3240 }
3241 if (ac==NULL) {
3242 /* Didn't find one here! */
3243 /* special case for RAID 1, especially
3244 where there are more than 2
3245 components (where RAIDframe treats
3246 things a little differently :( ) */
3247 if (parity_type == '1') {
3248 if (c%2 == 0) { /* even component */
3249 even_pair_failed = 1;
3250 } else { /* odd component. If
3251 we're failed, and
3252 so is the even
3253 component, it's
3254 "Good Night, Charlie" */
3255 if (even_pair_failed == 1) {
3256 return(0);
3257 }
3258 }
3259 } else {
3260 /* normal accounting */
3261 num_missing++;
3262 }
3263 }
3264 if ((parity_type == '1') && (c%2 == 1)) {
3265 			/* Just finished checking an even/odd pair without
3266 			   bailing.. reset the even_pair_failed flag,
3267 			   and go on to the next component.... */
3268 even_pair_failed = 0;
3269 }
3270 }
3271
3272 clabel = cset->ac->clabel;
3273
3274 if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3275 ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3276 ((clabel->parityConfig == '5') && (num_missing > 1))) {
3277 /* XXX this needs to be made *much* more general */
3278 /* Too many failures */
3279 return(0);
3280 }
3281 /* otherwise, all is well, and we've got enough to take a kick
3282 at autoconfiguring this set */
3283 return(1);
3284 }
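/*
 * Worked example of the RAID 1 special case above: in a 4-component
 * RAID 1 set the mirror pairs are (0,1) and (2,3).  Missing
 * components 0 and 2 is survivable, as one side of each pair remains;
 * missing both 0 and 1 kills the set, which is exactly what the
 * even_pair_failed bookkeeping detects.
 */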
3285
3286 void
3287 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3288 RF_Raid_t *raidPtr)
3289 {
3290 RF_ComponentLabel_t *clabel;
3291 int i;
3292
3293 clabel = ac->clabel;
3294
3295 /* 1. Fill in the common stuff */
3296 config->numRow = clabel->num_rows = 1;
3297 config->numCol = clabel->num_columns;
3298 config->numSpare = 0; /* XXX should this be set here? */
3299 config->sectPerSU = clabel->sectPerSU;
3300 config->SUsPerPU = clabel->SUsPerPU;
3301 config->SUsPerRU = clabel->SUsPerRU;
3302 config->parityConfig = clabel->parityConfig;
3303 /* XXX... */
3304 strcpy(config->diskQueueType,"fifo");
3305 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3306 config->layoutSpecificSize = 0; /* XXX ?? */
3307
3308 while(ac!=NULL) {
3309 		/* row/col values will be in range due to the checks
3310 		   in rf_reasonable_label() */
3311 strcpy(config->devnames[0][ac->clabel->column],
3312 ac->devname);
3313 ac = ac->next;
3314 }
3315
3316 for(i=0;i<RF_MAXDBGV;i++) {
3317 config->debugVars[i][0] = 0;
3318 }
3319 }
3320
3321 int
3322 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3323 {
3324 RF_ComponentLabel_t *clabel;
3325 int column;
3326 int sparecol;
3327
3328 raidPtr->autoconfigure = new_value;
3329
3330 for(column=0; column<raidPtr->numCol; column++) {
3331 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3332 clabel = raidget_component_label(raidPtr, column);
3333 clabel->autoconfigure = new_value;
3334 raidflush_component_label(raidPtr, column);
3335 }
3336 }
3337 for(column = 0; column < raidPtr->numSpare ; column++) {
3338 sparecol = raidPtr->numCol + column;
3339 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3340 clabel = raidget_component_label(raidPtr, sparecol);
3341 clabel->autoconfigure = new_value;
3342 raidflush_component_label(raidPtr, sparecol);
3343 }
3344 }
3345 return(new_value);
3346 }
3347
3348 int
3349 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3350 {
3351 RF_ComponentLabel_t *clabel;
3352 int column;
3353 int sparecol;
3354
3355 raidPtr->root_partition = new_value;
3356 for(column=0; column<raidPtr->numCol; column++) {
3357 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3358 clabel = raidget_component_label(raidPtr, column);
3359 clabel->root_partition = new_value;
3360 raidflush_component_label(raidPtr, column);
3361 }
3362 }
3363 for(column = 0; column < raidPtr->numSpare ; column++) {
3364 sparecol = raidPtr->numCol + column;
3365 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3366 clabel = raidget_component_label(raidPtr, sparecol);
3367 clabel->root_partition = new_value;
3368 raidflush_component_label(raidPtr, sparecol);
3369 }
3370 }
3371 return(new_value);
3372 }
3373
3374 void
3375 rf_release_all_vps(RF_ConfigSet_t *cset)
3376 {
3377 RF_AutoConfig_t *ac;
3378
3379 ac = cset->ac;
3380 while(ac!=NULL) {
3381 /* Close the vp, and give it back */
3382 if (ac->vp) {
3383 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3384 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3385 vput(ac->vp);
3386 ac->vp = NULL;
3387 }
3388 ac = ac->next;
3389 }
3390 }
3391
3392
3393 void
3394 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3395 {
3396 RF_AutoConfig_t *ac;
3397 RF_AutoConfig_t *next_ac;
3398
3399 ac = cset->ac;
3400 while(ac!=NULL) {
3401 next_ac = ac->next;
3402 /* nuke the label */
3403 free(ac->clabel, M_RAIDFRAME);
3404 /* cleanup the config structure */
3405 free(ac, M_RAIDFRAME);
3406 /* "next.." */
3407 ac = next_ac;
3408 }
3409 /* and, finally, nuke the config set */
3410 free(cset, M_RAIDFRAME);
3411 }
3412
3413
3414 void
3415 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3416 {
3417 /* current version number */
3418 clabel->version = RF_COMPONENT_LABEL_VERSION;
3419 clabel->serial_number = raidPtr->serial_number;
3420 clabel->mod_counter = raidPtr->mod_counter;
3421
3422 clabel->num_rows = 1;
3423 clabel->num_columns = raidPtr->numCol;
3424 clabel->clean = RF_RAID_DIRTY; /* not clean */
3425 clabel->status = rf_ds_optimal; /* "It's good!" */
3426
3427 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3428 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3429 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3430
3431 clabel->blockSize = raidPtr->bytesPerSector;
3432 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3433
3434 /* XXX not portable */
3435 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3436 clabel->maxOutstanding = raidPtr->maxOutstanding;
3437 clabel->autoconfigure = raidPtr->autoconfigure;
3438 clabel->root_partition = raidPtr->root_partition;
3439 clabel->last_unit = raidPtr->raidid;
3440 clabel->config_order = raidPtr->config_order;
3441
3442 #ifndef RF_NO_PARITY_MAP
3443 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3444 #endif
3445 }
3446
3447 struct raid_softc *
3448 rf_auto_config_set(RF_ConfigSet_t *cset)
3449 {
3450 RF_Raid_t *raidPtr;
3451 RF_Config_t *config;
3452 int raidID;
3453 struct raid_softc *sc;
3454
3455 #ifdef DEBUG
3456 printf("RAID autoconfigure\n");
3457 #endif
3458
3459 /* 1. Create a config structure */
3460 config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
3461 if (config == NULL) {
3462 printf("%s: Out of mem - config!?!?\n", __func__);
3463 /* XXX do something more intelligent here. */
3464 return NULL;
3465 }
3466
3467 	/*
3468 	 * 2. Figure out what RAID ID this one is supposed to live at.
3469 	 * See if we can get the same RAID dev that it was configured
3470 	 * on last time..
3471 	 */
3472
3473 raidID = cset->ac->clabel->last_unit;
3474 for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
3475 sc = raidget(++raidID, false))
3476 continue;
3477 #ifdef DEBUG
3478 printf("Configuring raid%d:\n",raidID);
3479 #endif
3480
3481 if (sc == NULL)
3482 sc = raidget(raidID, true);
3483 if (sc == NULL) {
3484 printf("%s: Out of mem - softc!?!?\n", __func__);
3485 /* XXX do something more intelligent here. */
3486 free(config, M_RAIDFRAME);
3487 return NULL;
3488 }
3489
3490 raidPtr = &sc->sc_r;
3491
3492 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3493 raidPtr->softc = sc;
3494 raidPtr->raidid = raidID;
3495 raidPtr->openings = RAIDOUTSTANDING;
3496
3497 /* 3. Build the configuration structure */
3498 rf_create_configuration(cset->ac, config, raidPtr);
3499
3500 /* 4. Do the configuration */
3501 if (rf_Configure(raidPtr, config, cset->ac) == 0) {
3502 raidinit(sc);
3503
3504 rf_markalldirty(raidPtr);
3505 raidPtr->autoconfigure = 1; /* XXX do this here? */
3506 switch (cset->ac->clabel->root_partition) {
3507 case 1: /* Force Root */
3508 		case 2:	/* Soft Root: root when boot partition is part of the raid */
3509 /*
3510 * everything configured just fine. Make a note
3511 * that this set is eligible to be root,
3512 * or forced to be root
3513 */
3514 cset->rootable = cset->ac->clabel->root_partition;
3515 /* XXX do this here? */
3516 raidPtr->root_partition = cset->rootable;
3517 break;
3518 default:
3519 break;
3520 }
3521 } else {
3522 raidput(sc);
3523 sc = NULL;
3524 }
3525
3526 /* 5. Cleanup */
3527 free(config, M_RAIDFRAME);
3528 return sc;
3529 }
3530
3531 void
3532 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
3533 size_t xmin, size_t xmax)
3534 {
3535 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3536 pool_sethiwat(p, xmax);
3537 pool_prime(p, xmin);
3538 pool_setlowat(p, xmin);
3539 }
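/*
 * A usage sketch of the helper above (never compiled; the pool and
 * element names are hypothetical).  Priming xmin elements and setting
 * a high-water mark of xmax keeps a reserve ready for the IPL_BIO
 * I/O path:
 */
#if 0
	static struct pool example_pool;

	rf_pool_init(&example_pool, sizeof(struct rf_example_elem),
	    "rfexmpl", 16, 64);
	/* elements then come from pool_get(&example_pool, PR_WAITOK) */
#endif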
3540
3541 /*
3542  * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buffer queue
3543 * to see if there is IO pending and if that IO could possibly be done
3544 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3545 * otherwise.
3546 *
3547 */
3548 int
3549 rf_buf_queue_check(RF_Raid_t *raidPtr)
3550 {
3551 struct raid_softc *rs;
3552 struct dk_softc *dksc;
3553
3554 rs = raidPtr->softc;
3555 dksc = &rs->sc_dksc;
3556
3557 if ((rs->sc_flags & RAIDF_INITED) == 0)
3558 return 1;
3559
3560 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3561 /* there is work to do */
3562 return 0;
3563 }
3564 /* default is nothing to do */
3565 return 1;
3566 }
3567
3568 int
3569 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3570 {
3571 uint64_t numsecs;
3572 unsigned secsize;
3573 int error;
3574
3575 error = getdisksize(vp, &numsecs, &secsize);
3576 if (error == 0) {
3577 diskPtr->blockSize = secsize;
3578 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3579 diskPtr->partitionSize = numsecs;
3580 return 0;
3581 }
3582 return error;
3583 }
3584
3585 static int
3586 raid_match(device_t self, cfdata_t cfdata, void *aux)
3587 {
3588 return 1;
3589 }
3590
3591 static void
3592 raid_attach(device_t parent, device_t self, void *aux)
3593 {
3594 }
3595
3596
3597 static int
3598 raid_detach(device_t self, int flags)
3599 {
3600 int error;
3601 struct raid_softc *rs = raidsoftc(self);
3602
3603 if (rs == NULL)
3604 return ENXIO;
3605
3606 if ((error = raidlock(rs)) != 0)
3607 return (error);
3608
3609 error = raid_detach_unlocked(rs);
3610
3611 raidunlock(rs);
3612
3613 /* XXX raid can be referenced here */
3614
3615 if (error)
3616 return error;
3617
3618 /* Free the softc */
3619 raidput(rs);
3620
3621 return 0;
3622 }
3623
3624 static void
3625 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3626 {
3627 struct dk_softc *dksc = &rs->sc_dksc;
3628 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3629
3630 memset(dg, 0, sizeof(*dg));
3631
3632 dg->dg_secperunit = raidPtr->totalSectors;
3633 dg->dg_secsize = raidPtr->bytesPerSector;
3634 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3635 dg->dg_ntracks = 4 * raidPtr->numCol;
3636
3637 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3638 }
3639
3640 /*
3641 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3642 * We end up returning whatever error was returned by the first cache flush
3643 * that fails.
3644 */
3645
3646 int
3647 rf_sync_component_caches(RF_Raid_t *raidPtr)
3648 {
3649 int c, sparecol;
3650 int e,error;
3651 int force = 1;
3652
3653 error = 0;
3654 for (c = 0; c < raidPtr->numCol; c++) {
3655 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3656 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3657 &force, FWRITE, NOCRED);
3658 if (e) {
3659 if (e != ENODEV)
3660 printf("raid%d: cache flush to component %s failed.\n",
3661 raidPtr->raidid, raidPtr->Disks[c].devname);
3662 if (error == 0) {
3663 error = e;
3664 }
3665 }
3666 }
3667 }
3668
3669 for( c = 0; c < raidPtr->numSpare ; c++) {
3670 sparecol = raidPtr->numCol + c;
3671 /* Need to ensure that the reconstruct actually completed! */
3672 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3673 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3674 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3675 if (e) {
3676 if (e != ENODEV)
3677 printf("raid%d: cache flush to component %s failed.\n",
3678 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3679 if (error == 0) {
3680 error = e;
3681 }
3682 }
3683 }
3684 }
3685 return error;
3686 }
3687
3688 /*
3689 * Module interface
3690 */
3691
3692 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
3693
3694 #ifdef _MODULE
3695 CFDRIVER_DECL(raid, DV_DISK, NULL);
3696 #endif
3697
3698 static int raid_modcmd(modcmd_t, void *);
3699 static int raid_modcmd_init(void);
3700 static int raid_modcmd_fini(void);
3701
3702 static int
3703 raid_modcmd(modcmd_t cmd, void *data)
3704 {
3705 int error;
3706
3707 error = 0;
3708 switch (cmd) {
3709 case MODULE_CMD_INIT:
3710 error = raid_modcmd_init();
3711 break;
3712 case MODULE_CMD_FINI:
3713 error = raid_modcmd_fini();
3714 break;
3715 default:
3716 error = ENOTTY;
3717 break;
3718 }
3719 return error;
3720 }
3721
3722 static int
3723 raid_modcmd_init(void)
3724 {
3725 int error;
3726 int bmajor, cmajor;
3727
3728 mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
3729 mutex_enter(&raid_lock);
3730 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3731 rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
3732 rf_init_cond2(rf_sparet_wait_cv, "sparetw");
3733 rf_init_cond2(rf_sparet_resp_cv, "rfgst");
3734
3735 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
3736 #endif
3737
3738 bmajor = cmajor = -1;
3739 error = devsw_attach("raid", &raid_bdevsw, &bmajor,
3740 &raid_cdevsw, &cmajor);
3741 if (error != 0 && error != EEXIST) {
3742 aprint_error("%s: devsw_attach failed %d\n", __func__, error);
3743 mutex_exit(&raid_lock);
3744 return error;
3745 }
3746 #ifdef _MODULE
3747 error = config_cfdriver_attach(&raid_cd);
3748 if (error != 0) {
3749 aprint_error("%s: config_cfdriver_attach failed %d\n",
3750 __func__, error);
3751 devsw_detach(&raid_bdevsw, &raid_cdevsw);
3752 mutex_exit(&raid_lock);
3753 return error;
3754 }
3755 #endif
3756 error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3757 if (error != 0) {
3758 aprint_error("%s: config_cfattach_attach failed %d\n",
3759 __func__, error);
3760 #ifdef _MODULE
3761 config_cfdriver_detach(&raid_cd);
3762 #endif
3763 devsw_detach(&raid_bdevsw, &raid_cdevsw);
3764 mutex_exit(&raid_lock);
3765 return error;
3766 }
3767
3768 raidautoconfigdone = false;
3769
3770 mutex_exit(&raid_lock);
3771
3772 if (error == 0) {
3773 if (rf_BootRaidframe(true) == 0)
3774 aprint_verbose("Kernelized RAIDframe activated\n");
3775 else
3776 panic("Serious error activating RAID!!");
3777 }
3778
3779 /*
3780 * Register a finalizer which will be used to auto-config RAID
3781 * sets once all real hardware devices have been found.
3782 */
3783 error = config_finalize_register(NULL, rf_autoconfig);
3784 if (error != 0) {
3785 aprint_error("WARNING: unable to register RAIDframe "
3786 "finalizer\n");
3787 error = 0;
3788 }
3789
3790 return error;
3791 }
3792
3793 static int
3794 raid_modcmd_fini(void)
3795 {
3796 int error;
3797
3798 mutex_enter(&raid_lock);
3799
3800 /* Don't allow unload if raid device(s) exist. */
3801 if (!LIST_EMPTY(&raids)) {
3802 mutex_exit(&raid_lock);
3803 return EBUSY;
3804 }
3805
3806 error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
3807 if (error != 0) {
3808 aprint_error("%s: cannot detach cfattach\n",__func__);
3809 mutex_exit(&raid_lock);
3810 return error;
3811 }
3812 #ifdef _MODULE
3813 error = config_cfdriver_detach(&raid_cd);
3814 if (error != 0) {
3815 aprint_error("%s: cannot detach cfdriver\n",__func__);
3816 config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3817 mutex_exit(&raid_lock);
3818 return error;
3819 }
3820 #endif
3821 error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
3822 if (error != 0) {
3823 aprint_error("%s: cannot detach devsw\n",__func__);
3824 #ifdef _MODULE
3825 config_cfdriver_attach(&raid_cd);
3826 #endif
3827 config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3828 mutex_exit(&raid_lock);
3829 return error;
3830 }
3831 rf_BootRaidframe(false);
3832 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3833 rf_destroy_mutex2(rf_sparet_wait_mutex);
3834 rf_destroy_cond2(rf_sparet_wait_cv);
3835 rf_destroy_cond2(rf_sparet_resp_cv);
3836 #endif
3837 mutex_exit(&raid_lock);
3838 mutex_destroy(&raid_lock);
3839
3840 return error;
3841 }
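/*
 * For reference: as a module this driver is handled with the standard
 * NetBSD tools, e.g. "modload raid" and "modunload raid"; note that
 * raid_modcmd_fini() above refuses to unload (EBUSY) while any raid
 * device still exists.
 */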
3842