rf_netbsdkintf.c revision 1.335 1 /* $NetBSD: rf_netbsdkintf.c,v 1.335 2016/01/03 08:17:24 mlelstv Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.335 2016/01/03 08:17:24 mlelstv Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130
131 #include <prop/proplib.h>
132
133 #include <dev/raidframe/raidframevar.h>
134 #include <dev/raidframe/raidframeio.h>
135 #include <dev/raidframe/rf_paritymap.h>
136
137 #include "rf_raid.h"
138 #include "rf_copyback.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150
151 #ifdef COMPAT_50
152 #include "rf_compat50.h"
153 #endif
154
155 #include "ioconf.h"
156
157 #ifdef DEBUG
158 int rf_kdebug_level = 0;
159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
160 #else /* DEBUG */
161 #define db1_printf(a) { }
162 #endif /* DEBUG */
163
164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
165 static rf_declare_mutex2(rf_sparet_wait_mutex);
166 static rf_declare_cond2(rf_sparet_wait_cv);
167 static rf_declare_cond2(rf_sparet_resp_cv);
168
169 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
170 * spare table */
171 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
172 * installation process */
173 #endif
174
175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
176
177 /* prototypes */
178 static void KernelWakeupFunc(struct buf *);
179 static void InitBP(struct buf *, struct vnode *, unsigned,
180 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
181 void *, int, struct proc *);
182 struct raid_softc;
183 static void raidinit(struct raid_softc *);
184 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
185
186 static int raid_match(device_t, cfdata_t, void *);
187 static void raid_attach(device_t, device_t, void *);
188 static int raid_detach(device_t, int);
189
190 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
191 daddr_t, daddr_t);
192 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
193 daddr_t, daddr_t, int);
194
195 static int raidwrite_component_label(unsigned,
196 dev_t, struct vnode *, RF_ComponentLabel_t *);
197 static int raidread_component_label(unsigned,
198 dev_t, struct vnode *, RF_ComponentLabel_t *);
199
200 static int raid_diskstart(device_t, struct buf *bp);
201 static int raid_dumpblocks(device_t, void *, daddr_t, int);
202 static int raid_lastclose(device_t);
203
204 static dev_type_open(raidopen);
205 static dev_type_close(raidclose);
206 static dev_type_read(raidread);
207 static dev_type_write(raidwrite);
208 static dev_type_ioctl(raidioctl);
209 static dev_type_strategy(raidstrategy);
210 static dev_type_dump(raiddump);
211 static dev_type_size(raidsize);
212
213 const struct bdevsw raid_bdevsw = {
214 .d_open = raidopen,
215 .d_close = raidclose,
216 .d_strategy = raidstrategy,
217 .d_ioctl = raidioctl,
218 .d_dump = raiddump,
219 .d_psize = raidsize,
220 .d_discard = nodiscard,
221 .d_flag = D_DISK
222 };
223
224 const struct cdevsw raid_cdevsw = {
225 .d_open = raidopen,
226 .d_close = raidclose,
227 .d_read = raidread,
228 .d_write = raidwrite,
229 .d_ioctl = raidioctl,
230 .d_stop = nostop,
231 .d_tty = notty,
232 .d_poll = nopoll,
233 .d_mmap = nommap,
234 .d_kqfilter = nokqfilter,
235 .d_discard = nodiscard,
236 .d_flag = D_DISK
237 };
238
239 static struct dkdriver rf_dkdriver = {
240 .d_open = raidopen,
241 .d_close = raidclose,
242 .d_strategy = raidstrategy,
243 .d_diskstart = raid_diskstart,
244 .d_dumpblocks = raid_dumpblocks,
245 .d_lastclose = raid_lastclose,
246 .d_minphys = minphys
247 };
248
/* Per-unit software state for a raid(4) pseudo-device. */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk framework state */
	int     sc_unit;		/* raid unit number (raidN) */
	int     sc_flags;		/* flags */
	int     sc_cflags;		/* configuration flags */
	kmutex_t sc_mutex;		/* interlock mutex */
	kcondvar_t sc_cv;		/* and the condvar */
	uint64_t sc_size;		/* size of the raid device */
	char    sc_xname[20];		/* XXX external name */
	RF_Raid_t sc_r;			/* RAIDframe per-array descriptor */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global "raids" list */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
#define RAIDF_DETACH	0x10	/* detach after final close */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */
269
270 #define raidunit(x) DISKUNIT(x)
271 #define raidsoftc(dev) (((struct raid_softc *)device_private(dev))->sc_r.softc)
272
273 extern struct cfdriver raid_cd;
274 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
275 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
276 DVF_DETACH_SHUTDOWN);
277
278 /*
279 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
280 * Be aware that large numbers can allow the driver to consume a lot of
281 * kernel memory, especially on writes, and in degraded mode reads.
282 *
283 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
284 * a single 64K write will typically require 64K for the old data,
285 * 64K for the old parity, and 64K for the new parity, for a total
286 * of 192K (if the parity buffer is not re-used immediately).
287 * Even it if is used immediately, that's still 128K, which when multiplied
288 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
289 *
290 * Now in degraded mode, for example, a 64K read on the above setup may
291 * require data reconstruction, which will require *all* of the 4 remaining
292 * disks to participate -- 4 * 32K/disk == 128K again.
293 */
294
295 #ifndef RAIDOUTSTANDING
296 #define RAIDOUTSTANDING 6
297 #endif
298
299 #define RAIDLABELDEV(dev) \
300 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
301
302 /* declared here, and made public, for the benefit of KVM stuff.. */
303
304 static int raidlock(struct raid_softc *);
305 static void raidunlock(struct raid_softc *);
306
307 static int raid_detach_unlocked(struct raid_softc *);
308
309 static void rf_markalldirty(RF_Raid_t *);
310 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
311
312 void rf_ReconThread(struct rf_recon_req *);
313 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
314 void rf_CopybackThread(RF_Raid_t *raidPtr);
315 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
316 int rf_autoconfig(device_t);
317 void rf_buildroothack(RF_ConfigSet_t *);
318
319 RF_AutoConfig_t *rf_find_raid_components(void);
320 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
321 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
322 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
323 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
324 int rf_set_autoconfig(RF_Raid_t *, int);
325 int rf_set_rootpartition(RF_Raid_t *, int);
326 void rf_release_all_vps(RF_ConfigSet_t *);
327 void rf_cleanup_config_set(RF_ConfigSet_t *);
328 int rf_have_enough_components(RF_ConfigSet_t *);
329 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
330 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
331
332 /*
333 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
334 * Note that this is overridden by having RAID_AUTOCONFIG as an option
335 * in the kernel config file.
336 */
337 #ifdef RAID_AUTOCONFIG
338 int raidautoconfig = 1;
339 #else
340 int raidautoconfig = 0;
341 #endif
342 static bool raidautoconfigdone = false;
343
344 struct RF_Pools_s rf_pools;
345
346 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
347 static kmutex_t raid_lock;
348
349 static struct raid_softc *
350 raidcreate(int unit) {
351 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
352 if (sc == NULL) {
353 #ifdef DIAGNOSTIC
354 printf("%s: out of memory\n", __func__);
355 #endif
356 return NULL;
357 }
358 sc->sc_unit = unit;
359 cv_init(&sc->sc_cv, "raidunit");
360 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
361 return sc;
362 }
363
364 static void
365 raiddestroy(struct raid_softc *sc) {
366 cv_destroy(&sc->sc_cv);
367 mutex_destroy(&sc->sc_mutex);
368 kmem_free(sc, sizeof(*sc));
369 }
370
371 static struct raid_softc *
372 raidget(int unit, bool create) {
373 struct raid_softc *sc;
374 if (unit < 0) {
375 #ifdef DIAGNOSTIC
376 panic("%s: unit %d!", __func__, unit);
377 #endif
378 return NULL;
379 }
380 mutex_enter(&raid_lock);
381 LIST_FOREACH(sc, &raids, sc_link) {
382 if (sc->sc_unit == unit) {
383 mutex_exit(&raid_lock);
384 return sc;
385 }
386 }
387 mutex_exit(&raid_lock);
388 if (!create)
389 return NULL;
390 if ((sc = raidcreate(unit)) == NULL)
391 return NULL;
392 mutex_enter(&raid_lock);
393 LIST_INSERT_HEAD(&raids, sc, sc_link);
394 mutex_exit(&raid_lock);
395 return sc;
396 }
397
398 static void
399 raidput(struct raid_softc *sc) {
400 mutex_enter(&raid_lock);
401 LIST_REMOVE(sc, sc_link);
402 mutex_exit(&raid_lock);
403 raiddestroy(sc);
404 }
405
/*
 * Historical pseudo-device attach entry point (for "pseudo-device raid"
 * in a kernel config).  Intentionally a no-op.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
415
416 int
417 rf_autoconfig(device_t self)
418 {
419 RF_AutoConfig_t *ac_list;
420 RF_ConfigSet_t *config_sets;
421
422 if (!raidautoconfig || raidautoconfigdone == true)
423 return (0);
424
425 /* XXX This code can only be run once. */
426 raidautoconfigdone = true;
427
428 #ifdef __HAVE_CPU_BOOTCONF
429 /*
430 * 0. find the boot device if needed first so we can use it later
431 * this needs to be done before we autoconfigure any raid sets,
432 * because if we use wedges we are not going to be able to open
433 * the boot device later
434 */
435 if (booted_device == NULL)
436 cpu_bootconf();
437 #endif
438 /* 1. locate all RAID components on the system */
439 aprint_debug("Searching for RAID components...\n");
440 ac_list = rf_find_raid_components();
441
442 /* 2. Sort them into their respective sets. */
443 config_sets = rf_create_auto_sets(ac_list);
444
445 /*
446 * 3. Evaluate each set and configure the valid ones.
447 * This gets done in rf_buildroothack().
448 */
449 rf_buildroothack(config_sets);
450
451 return 1;
452 }
453
/*
 * Check whether RAID set "r" contains device "bdv" (typically the boot
 * device) as one of its components.  Returns 1 if so, 0 otherwise.
 *
 * Component names are compared by device-node name with the "/dev/"
 * prefix stripped; for dk(4) wedges the parent disk's name is used
 * instead, so booting from a wedge on a component still matches.
 *
 * NOTE(review): the final comparison is strncmp() over
 * strlen(bootname) characters only — a prefix match.  E.g. boot
 * device "wd1" would also match component "wd10".  Presumably benign
 * in practice; confirm before relying on it.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname = device_xname(bdv);
	size_t len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* skip the "/dev/" prefix of the stored component path */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* wedge: compare against the parent disk's name */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
477
478 void
479 rf_buildroothack(RF_ConfigSet_t *config_sets)
480 {
481 RF_ConfigSet_t *cset;
482 RF_ConfigSet_t *next_cset;
483 int num_root;
484 struct raid_softc *sc, *rsc;
485 struct dk_softc *dksc;
486
487 sc = rsc = NULL;
488 num_root = 0;
489 cset = config_sets;
490 while (cset != NULL) {
491 next_cset = cset->next;
492 if (rf_have_enough_components(cset) &&
493 cset->ac->clabel->autoconfigure == 1) {
494 sc = rf_auto_config_set(cset);
495 if (sc != NULL) {
496 aprint_debug("raid%d: configured ok\n",
497 sc->sc_unit);
498 if (cset->rootable) {
499 rsc = sc;
500 num_root++;
501 }
502 } else {
503 /* The autoconfig didn't work :( */
504 aprint_debug("Autoconfig failed\n");
505 rf_release_all_vps(cset);
506 }
507 } else {
508 /* we're not autoconfiguring this set...
509 release the associated resources */
510 rf_release_all_vps(cset);
511 }
512 /* cleanup */
513 rf_cleanup_config_set(cset);
514 cset = next_cset;
515 }
516 dksc = &rsc->sc_dksc;
517
518 /* if the user has specified what the root device should be
519 then we don't touch booted_device or boothowto... */
520
521 if (rootspec != NULL)
522 return;
523
524 /* we found something bootable... */
525
526 /*
527 * XXX: The following code assumes that the root raid
528 * is the first ('a') partition. This is about the best
529 * we can do with a BSD disklabel, but we might be able
530 * to do better with a GPT label, by setting a specified
531 * attribute to indicate the root partition. We can then
532 * stash the partition number in the r->root_partition
533 * high bits (the bottom 2 bits are already used). For
534 * now we just set booted_partition to 0 when we override
535 * root.
536 */
537 if (num_root == 1) {
538 device_t candidate_root;
539 if (dksc->sc_dkdev.dk_nwedges != 0) {
540 char cname[sizeof(cset->ac->devname)];
541 /* XXX: assume 'a' */
542 snprintf(cname, sizeof(cname), "%s%c",
543 device_xname(dksc->sc_dev), 'a');
544 candidate_root = dkwedge_find_by_wname(cname);
545 } else
546 candidate_root = dksc->sc_dev;
547 if (booted_device == NULL ||
548 rsc->sc_r.root_partition == 1 ||
549 rf_containsboot(&rsc->sc_r, booted_device)) {
550 booted_device = candidate_root;
551 booted_partition = 0; /* XXX assume 'a' */
552 }
553 } else if (num_root > 1) {
554
555 /*
556 * Maybe the MD code can help. If it cannot, then
557 * setroot() will discover that we have no
558 * booted_device and will ask the user if nothing was
559 * hardwired in the kernel config file
560 */
561 if (booted_device == NULL)
562 return;
563
564 num_root = 0;
565 mutex_enter(&raid_lock);
566 LIST_FOREACH(sc, &raids, sc_link) {
567 RF_Raid_t *r = &sc->sc_r;
568 if (r->valid == 0)
569 continue;
570
571 if (r->root_partition == 0)
572 continue;
573
574 if (rf_containsboot(r, booted_device)) {
575 num_root++;
576 rsc = sc;
577 dksc = &rsc->sc_dksc;
578 }
579 }
580 mutex_exit(&raid_lock);
581
582 if (num_root == 1) {
583 booted_device = dksc->sc_dev;
584 booted_partition = 0; /* XXX assume 'a' */
585 } else {
586 /* we can't guess.. require the user to answer... */
587 boothowto |= RB_ASKNAME;
588 }
589 }
590 }
591
592 static int
593 raidsize(dev_t dev)
594 {
595 struct raid_softc *rs;
596 struct dk_softc *dksc;
597 unsigned int unit;
598
599 unit = raidunit(dev);
600 if ((rs = raidget(unit, false)) == NULL)
601 return ENXIO;
602 dksc = &rs->sc_dksc;
603
604 if ((rs->sc_flags & RAIDF_INITED) == 0)
605 return (ENODEV);
606
607 return dk_size(dksc, dev);
608 }
609
610 static int
611 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
612 {
613 unsigned int unit;
614 struct raid_softc *rs;
615 struct dk_softc *dksc;
616
617 unit = raidunit(dev);
618 if ((rs = raidget(unit, false)) == NULL)
619 return ENXIO;
620 dksc = &rs->sc_dksc;
621
622 if ((rs->sc_flags & RAIDF_INITED) == 0)
623 return ENODEV;
624
625 return dk_dump(dksc, dev, blkno, va, size);
626 }
627
628 static int
629 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
630 {
631 struct raid_softc *rs = raidsoftc(dev);
632 const struct bdevsw *bdev;
633 RF_Raid_t *raidPtr;
634 int c, sparecol, j, scol, dumpto;
635 int error = 0;
636
637 raidPtr = &rs->sc_r;
638
639 /* we only support dumping to RAID 1 sets */
640 if (raidPtr->Layout.numDataCol != 1 ||
641 raidPtr->Layout.numParityCol != 1)
642 return EINVAL;
643
644 if ((error = raidlock(rs)) != 0)
645 return error;
646
647 /* figure out what device is alive.. */
648
649 /*
650 Look for a component to dump to. The preference for the
651 component to dump to is as follows:
652 1) the master
653 2) a used_spare of the master
654 3) the slave
655 4) a used_spare of the slave
656 */
657
658 dumpto = -1;
659 for (c = 0; c < raidPtr->numCol; c++) {
660 if (raidPtr->Disks[c].status == rf_ds_optimal) {
661 /* this might be the one */
662 dumpto = c;
663 break;
664 }
665 }
666
667 /*
668 At this point we have possibly selected a live master or a
669 live slave. We now check to see if there is a spared
670 master (or a spared slave), if we didn't find a live master
671 or a live slave.
672 */
673
674 for (c = 0; c < raidPtr->numSpare; c++) {
675 sparecol = raidPtr->numCol + c;
676 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
677 /* How about this one? */
678 scol = -1;
679 for(j=0;j<raidPtr->numCol;j++) {
680 if (raidPtr->Disks[j].spareCol == sparecol) {
681 scol = j;
682 break;
683 }
684 }
685 if (scol == 0) {
686 /*
687 We must have found a spared master!
688 We'll take that over anything else
689 found so far. (We couldn't have
690 found a real master before, since
691 this is a used spare, and it's
692 saying that it's replacing the
693 master.) On reboot (with
694 autoconfiguration turned on)
695 sparecol will become the 1st
696 component (component0) of this set.
697 */
698 dumpto = sparecol;
699 break;
700 } else if (scol != -1) {
701 /*
702 Must be a spared slave. We'll dump
703 to that if we havn't found anything
704 else so far.
705 */
706 if (dumpto == -1)
707 dumpto = sparecol;
708 }
709 }
710 }
711
712 if (dumpto == -1) {
713 /* we couldn't find any live components to dump to!?!?
714 */
715 error = EINVAL;
716 goto out;
717 }
718
719 bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
720
721 error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
722 blkno, va, nblk);
723
724 out:
725 raidunlock(rs);
726
727 return error;
728 }
729
730 /* ARGSUSED */
731 static int
732 raidopen(dev_t dev, int flags, int fmt,
733 struct lwp *l)
734 {
735 int unit = raidunit(dev);
736 struct raid_softc *rs;
737 struct dk_softc *dksc;
738 int error = 0;
739 int part, pmask;
740
741 if ((rs = raidget(unit, true)) == NULL)
742 return ENXIO;
743 if ((error = raidlock(rs)) != 0)
744 return (error);
745
746 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
747 error = EBUSY;
748 goto bad;
749 }
750
751 dksc = &rs->sc_dksc;
752
753 part = DISKPART(dev);
754 pmask = (1 << part);
755
756 if (!DK_BUSY(dksc, pmask) &&
757 ((rs->sc_flags & RAIDF_INITED) != 0)) {
758 /* First one... mark things as dirty... Note that we *MUST*
759 have done a configure before this. I DO NOT WANT TO BE
760 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
761 THAT THEY BELONG TOGETHER!!!!! */
762 /* XXX should check to see if we're only open for reading
763 here... If so, we needn't do this, but then need some
764 other way of keeping track of what's happened.. */
765
766 rf_markalldirty(&rs->sc_r);
767 }
768
769 if ((rs->sc_flags & RAIDF_INITED) != 0)
770 error = dk_open(dksc, dev, flags, fmt, l);
771
772 bad:
773 raidunlock(rs);
774
775 return (error);
776
777
778 }
779
780 static int
781 raid_lastclose(device_t self)
782 {
783 struct raid_softc *rs = raidsoftc(self);
784
785 /* Last one... device is not unconfigured yet.
786 Device shutdown has taken care of setting the
787 clean bits if RAIDF_INITED is not set
788 mark things as clean... */
789
790 rf_update_component_labels(&rs->sc_r,
791 RF_FINAL_COMPONENT_UPDATE);
792
793 /* pass to unlocked code */
794 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
795 rs->sc_flags |= RAIDF_DETACH;
796
797 return 0;
798 }
799
/*
 * Close a partition of a raid unit.  The disk-layer close runs under
 * the unit lock; if this was the final close of a unit marked for
 * shutdown, the pseudo-device is detached (RAIDF_DETACH, set by
 * raid_lastclose()) or the never-configured softc is released
 * (RAIDF_SHUTDOWN).  Both must happen after the unit lock is dropped.
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* raid_lastclose() may have set RAIDF_DETACH */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
839
/*
 * Wake whoever is sleeping on the raid set's iodone condvar so that
 * queued I/O gets picked up.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
847
848 static void
849 raidstrategy(struct buf *bp)
850 {
851 unsigned int unit;
852 struct raid_softc *rs;
853 struct dk_softc *dksc;
854 RF_Raid_t *raidPtr;
855
856 unit = raidunit(bp->b_dev);
857 if ((rs = raidget(unit, false)) == NULL) {
858 bp->b_error = ENXIO;
859 goto fail;
860 }
861 if ((rs->sc_flags & RAIDF_INITED) == 0) {
862 bp->b_error = ENXIO;
863 goto fail;
864 }
865 dksc = &rs->sc_dksc;
866 raidPtr = &rs->sc_r;
867
868 /* Queue IO only */
869 if (dk_strategy_defer(dksc, bp))
870 goto done;
871
872 /* schedule the IO to happen at the next convenient time */
873 raid_wakeup(raidPtr);
874
875 done:
876 return;
877
878 fail:
879 bp->b_resid = bp->b_bcount;
880 biodone(bp);
881 }
882
883 static int
884 raid_diskstart(device_t dev, struct buf *bp)
885 {
886 struct raid_softc *rs = raidsoftc(dev);
887 RF_Raid_t *raidPtr;
888
889 raidPtr = &rs->sc_r;
890 if (!raidPtr->valid) {
891 db1_printf(("raid is not valid..\n"));
892 return ENODEV;
893 }
894
895 /* XXX */
896 bp->b_resid = 0;
897
898 return raiddoaccess(raidPtr, bp);
899 }
900
901 void
902 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
903 {
904 struct raid_softc *rs;
905 struct dk_softc *dksc;
906
907 rs = raidPtr->softc;
908 dksc = &rs->sc_dksc;
909
910 dk_done(dksc, bp);
911
912 rf_lock_mutex2(raidPtr->mutex);
913 raidPtr->openings++;
914 rf_unlock_mutex2(raidPtr->mutex);
915
916 /* schedule more IO */
917 raid_wakeup(raidPtr);
918 }
919
920 /* ARGSUSED */
921 static int
922 raidread(dev_t dev, struct uio *uio, int flags)
923 {
924 int unit = raidunit(dev);
925 struct raid_softc *rs;
926
927 if ((rs = raidget(unit, false)) == NULL)
928 return ENXIO;
929
930 if ((rs->sc_flags & RAIDF_INITED) == 0)
931 return (ENXIO);
932
933 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
934
935 }
936
937 /* ARGSUSED */
938 static int
939 raidwrite(dev_t dev, struct uio *uio, int flags)
940 {
941 int unit = raidunit(dev);
942 struct raid_softc *rs;
943
944 if ((rs = raidget(unit, false)) == NULL)
945 return ENXIO;
946
947 if ((rs->sc_flags & RAIDF_INITED) == 0)
948 return (ENXIO);
949
950 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
951
952 }
953
/*
 * Unconfigure and tear down a raid unit.  Caller must hold the unit
 * lock (raidlock()).  Returns EBUSY if any partition is still open,
 * 0 if the unit was not configured or was torn down successfully, or
 * the error from rf_Shutdown().
 *
 * Teardown order matters: RAIDframe is shut down first, then queued
 * buffers are drained, and only then is the disk detached from the
 * framework.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* refuse while any consumer still has the device open */
	if (DK_BUSY(dksc, 0))
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
988
989 static int
990 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
991 {
992 int unit = raidunit(dev);
993 int error = 0;
994 int part, pmask;
995 struct raid_softc *rs;
996 struct dk_softc *dksc;
997 RF_Config_t *k_cfg, *u_cfg;
998 RF_Raid_t *raidPtr;
999 RF_RaidDisk_t *diskPtr;
1000 RF_AccTotals_t *totals;
1001 RF_DeviceConfig_t *d_cfg, **ucfgp;
1002 u_char *specific_buf;
1003 int retcode = 0;
1004 int column;
1005 /* int raidid; */
1006 struct rf_recon_req *rrcopy, *rr;
1007 RF_ComponentLabel_t *clabel;
1008 RF_ComponentLabel_t *ci_label;
1009 RF_ComponentLabel_t **clabel_ptr;
1010 RF_SingleComponent_t *sparePtr,*componentPtr;
1011 RF_SingleComponent_t component;
1012 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1013 int i, j, d;
1014
1015 if ((rs = raidget(unit, false)) == NULL)
1016 return ENXIO;
1017 dksc = &rs->sc_dksc;
1018 raidPtr = &rs->sc_r;
1019
1020 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1021 (int) DISKPART(dev), (int) unit, cmd));
1022
1023 /* Must be initialized for these... */
1024 switch (cmd) {
1025 case RAIDFRAME_REWRITEPARITY:
1026 case RAIDFRAME_GET_INFO:
1027 case RAIDFRAME_RESET_ACCTOTALS:
1028 case RAIDFRAME_GET_ACCTOTALS:
1029 case RAIDFRAME_KEEP_ACCTOTALS:
1030 case RAIDFRAME_GET_SIZE:
1031 case RAIDFRAME_FAIL_DISK:
1032 case RAIDFRAME_COPYBACK:
1033 case RAIDFRAME_CHECK_RECON_STATUS:
1034 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1035 case RAIDFRAME_GET_COMPONENT_LABEL:
1036 case RAIDFRAME_SET_COMPONENT_LABEL:
1037 case RAIDFRAME_ADD_HOT_SPARE:
1038 case RAIDFRAME_REMOVE_HOT_SPARE:
1039 case RAIDFRAME_INIT_LABELS:
1040 case RAIDFRAME_REBUILD_IN_PLACE:
1041 case RAIDFRAME_CHECK_PARITY:
1042 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1043 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1044 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1045 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1046 case RAIDFRAME_SET_AUTOCONFIG:
1047 case RAIDFRAME_SET_ROOT:
1048 case RAIDFRAME_DELETE_COMPONENT:
1049 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1050 case RAIDFRAME_PARITYMAP_STATUS:
1051 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1052 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1053 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1054 if ((rs->sc_flags & RAIDF_INITED) == 0)
1055 return (ENXIO);
1056 }
1057
1058 switch (cmd) {
1059 #ifdef COMPAT_50
1060 case RAIDFRAME_GET_INFO50:
1061 return rf_get_info50(raidPtr, data);
1062
1063 case RAIDFRAME_CONFIGURE50:
1064 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1065 return retcode;
1066 goto config;
1067 #endif
1068 /* configure the system */
1069 case RAIDFRAME_CONFIGURE:
1070
1071 if (raidPtr->valid) {
1072 /* There is a valid RAID set running on this unit! */
1073 printf("raid%d: Device already configured!\n",unit);
1074 return(EINVAL);
1075 }
1076
1077 /* copy-in the configuration information */
1078 /* data points to a pointer to the configuration structure */
1079
1080 u_cfg = *((RF_Config_t **) data);
1081 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1082 if (k_cfg == NULL) {
1083 return (ENOMEM);
1084 }
1085 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1086 if (retcode) {
1087 RF_Free(k_cfg, sizeof(RF_Config_t));
1088 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1089 retcode));
1090 goto no_config;
1091 }
1092 goto config;
1093 config:
1094 rs->sc_flags &= ~RAIDF_SHUTDOWN;
1095
1096 /* allocate a buffer for the layout-specific data, and copy it
1097 * in */
1098 if (k_cfg->layoutSpecificSize) {
1099 if (k_cfg->layoutSpecificSize > 10000) {
1100 /* sanity check */
1101 RF_Free(k_cfg, sizeof(RF_Config_t));
1102 retcode = EINVAL;
1103 goto no_config;
1104 }
1105 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1106 (u_char *));
1107 if (specific_buf == NULL) {
1108 RF_Free(k_cfg, sizeof(RF_Config_t));
1109 retcode = ENOMEM;
1110 goto no_config;
1111 }
1112 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1113 k_cfg->layoutSpecificSize);
1114 if (retcode) {
1115 RF_Free(k_cfg, sizeof(RF_Config_t));
1116 RF_Free(specific_buf,
1117 k_cfg->layoutSpecificSize);
1118 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1119 retcode));
1120 goto no_config;
1121 }
1122 } else
1123 specific_buf = NULL;
1124 k_cfg->layoutSpecific = specific_buf;
1125
1126 /* should do some kind of sanity check on the configuration.
1127 * Store the sum of all the bytes in the last byte? */
1128
1129 /* configure the system */
1130
1131 /*
1132 * Clear the entire RAID descriptor, just to make sure
1133 * there is no stale data left in the case of a
1134 * reconfiguration
1135 */
1136 memset(raidPtr, 0, sizeof(*raidPtr));
1137 raidPtr->softc = rs;
1138 raidPtr->raidid = unit;
1139
1140 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1141
1142 if (retcode == 0) {
1143
1144 /* allow this many simultaneous IO's to
1145 this RAID device */
1146 raidPtr->openings = RAIDOUTSTANDING;
1147
1148 raidinit(rs);
1149 raid_wakeup(raidPtr);
1150 rf_markalldirty(raidPtr);
1151 }
1152 /* free the buffers. No return code here. */
1153 if (k_cfg->layoutSpecificSize) {
1154 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1155 }
1156 RF_Free(k_cfg, sizeof(RF_Config_t));
1157
1158 no_config:
1159 /*
1160 * If configuration failed, set sc_flags so that we
1161 * will detach the device when we close it.
1162 */
1163 if (retcode != 0)
1164 rs->sc_flags |= RAIDF_SHUTDOWN;
1165 return (retcode);
1166
1167 /* shutdown the system */
1168 case RAIDFRAME_SHUTDOWN:
1169
1170 part = DISKPART(dev);
1171 pmask = (1 << part);
1172
1173 if ((error = raidlock(rs)) != 0)
1174 return (error);
1175
1176 if (DK_BUSY(dksc, pmask))
1177 retcode = EBUSY;
1178 else {
1179 /* detach and free on close */
1180 rs->sc_flags |= RAIDF_SHUTDOWN;
1181 retcode = 0;
1182 }
1183
1184 raidunlock(rs);
1185
1186 return (retcode);
1187 case RAIDFRAME_GET_COMPONENT_LABEL:
1188 clabel_ptr = (RF_ComponentLabel_t **) data;
1189 /* need to read the component label for the disk indicated
1190 by row,column in clabel */
1191
1192 /*
1193 * Perhaps there should be an option to skip the in-core
1194 * copy and hit the disk, as with disklabel(8).
1195 */
1196 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1197
1198 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1199
1200 if (retcode) {
1201 RF_Free(clabel, sizeof(*clabel));
1202 return retcode;
1203 }
1204
1205 clabel->row = 0; /* Don't allow looking at anything else.*/
1206
1207 column = clabel->column;
1208
1209 if ((column < 0) || (column >= raidPtr->numCol +
1210 raidPtr->numSpare)) {
1211 RF_Free(clabel, sizeof(*clabel));
1212 return EINVAL;
1213 }
1214
1215 RF_Free(clabel, sizeof(*clabel));
1216
1217 clabel = raidget_component_label(raidPtr, column);
1218
1219 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1220
1221 #if 0
1222 case RAIDFRAME_SET_COMPONENT_LABEL:
1223 clabel = (RF_ComponentLabel_t *) data;
1224
1225 /* XXX check the label for valid stuff... */
1226 /* Note that some things *should not* get modified --
1227 the user should be re-initing the labels instead of
1228 trying to patch things.
1229 */
1230
1231 raidid = raidPtr->raidid;
1232 #ifdef DEBUG
1233 printf("raid%d: Got component label:\n", raidid);
1234 printf("raid%d: Version: %d\n", raidid, clabel->version);
1235 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1236 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1237 printf("raid%d: Column: %d\n", raidid, clabel->column);
1238 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1239 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1240 printf("raid%d: Status: %d\n", raidid, clabel->status);
1241 #endif
1242 clabel->row = 0;
1243 column = clabel->column;
1244
1245 if ((column < 0) || (column >= raidPtr->numCol)) {
1246 return(EINVAL);
1247 }
1248
1249 /* XXX this isn't allowed to do anything for now :-) */
1250
1251 /* XXX and before it is, we need to fill in the rest
1252 of the fields!?!?!?! */
1253 memcpy(raidget_component_label(raidPtr, column),
1254 clabel, sizeof(*clabel));
1255 raidflush_component_label(raidPtr, column);
1256 return (0);
1257 #endif
1258
1259 case RAIDFRAME_INIT_LABELS:
1260 clabel = (RF_ComponentLabel_t *) data;
1261 /*
1262 we only want the serial number from
1263 the above. We get all the rest of the information
1264 from the config that was used to create this RAID
1265 set.
1266 */
1267
1268 raidPtr->serial_number = clabel->serial_number;
1269
1270 for(column=0;column<raidPtr->numCol;column++) {
1271 diskPtr = &raidPtr->Disks[column];
1272 if (!RF_DEAD_DISK(diskPtr->status)) {
1273 ci_label = raidget_component_label(raidPtr,
1274 column);
1275 /* Zeroing this is important. */
1276 memset(ci_label, 0, sizeof(*ci_label));
1277 raid_init_component_label(raidPtr, ci_label);
1278 ci_label->serial_number =
1279 raidPtr->serial_number;
1280 ci_label->row = 0; /* we dont' pretend to support more */
1281 rf_component_label_set_partitionsize(ci_label,
1282 diskPtr->partitionSize);
1283 ci_label->column = column;
1284 raidflush_component_label(raidPtr, column);
1285 }
1286 /* XXXjld what about the spares? */
1287 }
1288
1289 return (retcode);
1290 case RAIDFRAME_SET_AUTOCONFIG:
1291 d = rf_set_autoconfig(raidPtr, *(int *) data);
1292 printf("raid%d: New autoconfig value is: %d\n",
1293 raidPtr->raidid, d);
1294 *(int *) data = d;
1295 return (retcode);
1296
1297 case RAIDFRAME_SET_ROOT:
1298 d = rf_set_rootpartition(raidPtr, *(int *) data);
1299 printf("raid%d: New rootpartition value is: %d\n",
1300 raidPtr->raidid, d);
1301 *(int *) data = d;
1302 return (retcode);
1303
1304 /* initialize all parity */
1305 case RAIDFRAME_REWRITEPARITY:
1306
1307 if (raidPtr->Layout.map->faultsTolerated == 0) {
1308 /* Parity for RAID 0 is trivially correct */
1309 raidPtr->parity_good = RF_RAID_CLEAN;
1310 return(0);
1311 }
1312
1313 if (raidPtr->parity_rewrite_in_progress == 1) {
1314 /* Re-write is already in progress! */
1315 return(EINVAL);
1316 }
1317
1318 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1319 rf_RewriteParityThread,
1320 raidPtr,"raid_parity");
1321 return (retcode);
1322
1323
1324 case RAIDFRAME_ADD_HOT_SPARE:
1325 sparePtr = (RF_SingleComponent_t *) data;
1326 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1327 retcode = rf_add_hot_spare(raidPtr, &component);
1328 return(retcode);
1329
1330 case RAIDFRAME_REMOVE_HOT_SPARE:
1331 return(retcode);
1332
1333 case RAIDFRAME_DELETE_COMPONENT:
1334 componentPtr = (RF_SingleComponent_t *)data;
1335 memcpy( &component, componentPtr,
1336 sizeof(RF_SingleComponent_t));
1337 retcode = rf_delete_component(raidPtr, &component);
1338 return(retcode);
1339
1340 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1341 componentPtr = (RF_SingleComponent_t *)data;
1342 memcpy( &component, componentPtr,
1343 sizeof(RF_SingleComponent_t));
1344 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1345 return(retcode);
1346
1347 case RAIDFRAME_REBUILD_IN_PLACE:
1348
1349 if (raidPtr->Layout.map->faultsTolerated == 0) {
1350 /* Can't do this on a RAID 0!! */
1351 return(EINVAL);
1352 }
1353
1354 if (raidPtr->recon_in_progress == 1) {
1355 /* a reconstruct is already in progress! */
1356 return(EINVAL);
1357 }
1358
1359 componentPtr = (RF_SingleComponent_t *) data;
1360 memcpy( &component, componentPtr,
1361 sizeof(RF_SingleComponent_t));
1362 component.row = 0; /* we don't support any more */
1363 column = component.column;
1364
1365 if ((column < 0) || (column >= raidPtr->numCol)) {
1366 return(EINVAL);
1367 }
1368
1369 rf_lock_mutex2(raidPtr->mutex);
1370 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1371 (raidPtr->numFailures > 0)) {
1372 /* XXX 0 above shouldn't be constant!!! */
1373 /* some component other than this has failed.
1374 Let's not make things worse than they already
1375 are... */
1376 printf("raid%d: Unable to reconstruct to disk at:\n",
1377 raidPtr->raidid);
1378 printf("raid%d: Col: %d Too many failures.\n",
1379 raidPtr->raidid, column);
1380 rf_unlock_mutex2(raidPtr->mutex);
1381 return (EINVAL);
1382 }
1383 if (raidPtr->Disks[column].status ==
1384 rf_ds_reconstructing) {
1385 printf("raid%d: Unable to reconstruct to disk at:\n",
1386 raidPtr->raidid);
1387 printf("raid%d: Col: %d Reconstruction already occurring!\n", raidPtr->raidid, column);
1388
1389 rf_unlock_mutex2(raidPtr->mutex);
1390 return (EINVAL);
1391 }
1392 if (raidPtr->Disks[column].status == rf_ds_spared) {
1393 rf_unlock_mutex2(raidPtr->mutex);
1394 return (EINVAL);
1395 }
1396 rf_unlock_mutex2(raidPtr->mutex);
1397
1398 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1399 if (rrcopy == NULL)
1400 return(ENOMEM);
1401
1402 rrcopy->raidPtr = (void *) raidPtr;
1403 rrcopy->col = column;
1404
1405 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1406 rf_ReconstructInPlaceThread,
1407 rrcopy,"raid_reconip");
1408 return(retcode);
1409
1410 case RAIDFRAME_GET_INFO:
1411 if (!raidPtr->valid)
1412 return (ENODEV);
1413 ucfgp = (RF_DeviceConfig_t **) data;
1414 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1415 (RF_DeviceConfig_t *));
1416 if (d_cfg == NULL)
1417 return (ENOMEM);
1418 d_cfg->rows = 1; /* there is only 1 row now */
1419 d_cfg->cols = raidPtr->numCol;
1420 d_cfg->ndevs = raidPtr->numCol;
1421 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1422 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1423 return (ENOMEM);
1424 }
1425 d_cfg->nspares = raidPtr->numSpare;
1426 if (d_cfg->nspares >= RF_MAX_DISKS) {
1427 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1428 return (ENOMEM);
1429 }
1430 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1431 d = 0;
1432 for (j = 0; j < d_cfg->cols; j++) {
1433 d_cfg->devs[d] = raidPtr->Disks[j];
1434 d++;
1435 }
1436 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1437 d_cfg->spares[i] = raidPtr->Disks[j];
1438 if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
1439 /* XXX: raidctl(8) expects to see this as a used spare */
1440 d_cfg->spares[i].status = rf_ds_used_spare;
1441 }
1442 }
1443 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1444 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1445
1446 return (retcode);
1447
1448 case RAIDFRAME_CHECK_PARITY:
1449 *(int *) data = raidPtr->parity_good;
1450 return (0);
1451
1452 case RAIDFRAME_PARITYMAP_STATUS:
1453 if (rf_paritymap_ineligible(raidPtr))
1454 return EINVAL;
1455 rf_paritymap_status(raidPtr->parity_map,
1456 (struct rf_pmstat *)data);
1457 return 0;
1458
1459 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1460 if (rf_paritymap_ineligible(raidPtr))
1461 return EINVAL;
1462 if (raidPtr->parity_map == NULL)
1463 return ENOENT; /* ??? */
1464 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1465 (struct rf_pmparams *)data, 1))
1466 return EINVAL;
1467 return 0;
1468
1469 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1470 if (rf_paritymap_ineligible(raidPtr))
1471 return EINVAL;
1472 *(int *) data = rf_paritymap_get_disable(raidPtr);
1473 return 0;
1474
1475 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1476 if (rf_paritymap_ineligible(raidPtr))
1477 return EINVAL;
1478 rf_paritymap_set_disable(raidPtr, *(int *)data);
1479 /* XXX should errors be passed up? */
1480 return 0;
1481
1482 case RAIDFRAME_RESET_ACCTOTALS:
1483 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1484 return (0);
1485
1486 case RAIDFRAME_GET_ACCTOTALS:
1487 totals = (RF_AccTotals_t *) data;
1488 *totals = raidPtr->acc_totals;
1489 return (0);
1490
1491 case RAIDFRAME_KEEP_ACCTOTALS:
1492 raidPtr->keep_acc_totals = *(int *)data;
1493 return (0);
1494
1495 case RAIDFRAME_GET_SIZE:
1496 *(int *) data = raidPtr->totalSectors;
1497 return (0);
1498
1499 /* fail a disk & optionally start reconstruction */
1500 case RAIDFRAME_FAIL_DISK:
1501
1502 if (raidPtr->Layout.map->faultsTolerated == 0) {
1503 /* Can't do this on a RAID 0!! */
1504 return(EINVAL);
1505 }
1506
1507 rr = (struct rf_recon_req *) data;
1508 rr->row = 0;
1509 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1510 return (EINVAL);
1511
1512
1513 rf_lock_mutex2(raidPtr->mutex);
1514 if (raidPtr->status == rf_rs_reconstructing) {
1515 /* you can't fail a disk while we're reconstructing! */
1516 /* XXX wrong for RAID6 */
1517 rf_unlock_mutex2(raidPtr->mutex);
1518 return (EINVAL);
1519 }
1520 if ((raidPtr->Disks[rr->col].status ==
1521 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1522 /* some other component has failed. Let's not make
1523 things worse. XXX wrong for RAID6 */
1524 rf_unlock_mutex2(raidPtr->mutex);
1525 return (EINVAL);
1526 }
1527 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1528 /* Can't fail a spared disk! */
1529 rf_unlock_mutex2(raidPtr->mutex);
1530 return (EINVAL);
1531 }
1532 rf_unlock_mutex2(raidPtr->mutex);
1533
1534 /* make a copy of the recon request so that we don't rely on
1535 * the user's buffer */
1536 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1537 if (rrcopy == NULL)
1538 return(ENOMEM);
1539 memcpy(rrcopy, rr, sizeof(*rr));
1540 rrcopy->raidPtr = (void *) raidPtr;
1541
1542 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1543 rf_ReconThread,
1544 rrcopy,"raid_recon");
1545 return (0);
1546
1547 /* invoke a copyback operation after recon on whatever disk
1548 * needs it, if any */
1549 case RAIDFRAME_COPYBACK:
1550
1551 if (raidPtr->Layout.map->faultsTolerated == 0) {
1552 /* This makes no sense on a RAID 0!! */
1553 return(EINVAL);
1554 }
1555
1556 if (raidPtr->copyback_in_progress == 1) {
1557 /* Copyback is already in progress! */
1558 return(EINVAL);
1559 }
1560
1561 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1562 rf_CopybackThread,
1563 raidPtr,"raid_copyback");
1564 return (retcode);
1565
1566 /* return the percentage completion of reconstruction */
1567 case RAIDFRAME_CHECK_RECON_STATUS:
1568 if (raidPtr->Layout.map->faultsTolerated == 0) {
1569 /* This makes no sense on a RAID 0, so tell the
1570 user it's done. */
1571 *(int *) data = 100;
1572 return(0);
1573 }
1574 if (raidPtr->status != rf_rs_reconstructing)
1575 *(int *) data = 100;
1576 else {
1577 if (raidPtr->reconControl->numRUsTotal > 0) {
1578 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1579 } else {
1580 *(int *) data = 0;
1581 }
1582 }
1583 return (0);
1584 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1585 progressInfoPtr = (RF_ProgressInfo_t **) data;
1586 if (raidPtr->status != rf_rs_reconstructing) {
1587 progressInfo.remaining = 0;
1588 progressInfo.completed = 100;
1589 progressInfo.total = 100;
1590 } else {
1591 progressInfo.total =
1592 raidPtr->reconControl->numRUsTotal;
1593 progressInfo.completed =
1594 raidPtr->reconControl->numRUsComplete;
1595 progressInfo.remaining = progressInfo.total -
1596 progressInfo.completed;
1597 }
1598 retcode = copyout(&progressInfo, *progressInfoPtr,
1599 sizeof(RF_ProgressInfo_t));
1600 return (retcode);
1601
1602 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1603 if (raidPtr->Layout.map->faultsTolerated == 0) {
1604 /* This makes no sense on a RAID 0, so tell the
1605 user it's done. */
1606 *(int *) data = 100;
1607 return(0);
1608 }
1609 if (raidPtr->parity_rewrite_in_progress == 1) {
1610 *(int *) data = 100 *
1611 raidPtr->parity_rewrite_stripes_done /
1612 raidPtr->Layout.numStripe;
1613 } else {
1614 *(int *) data = 100;
1615 }
1616 return (0);
1617
1618 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1619 progressInfoPtr = (RF_ProgressInfo_t **) data;
1620 if (raidPtr->parity_rewrite_in_progress == 1) {
1621 progressInfo.total = raidPtr->Layout.numStripe;
1622 progressInfo.completed =
1623 raidPtr->parity_rewrite_stripes_done;
1624 progressInfo.remaining = progressInfo.total -
1625 progressInfo.completed;
1626 } else {
1627 progressInfo.remaining = 0;
1628 progressInfo.completed = 100;
1629 progressInfo.total = 100;
1630 }
1631 retcode = copyout(&progressInfo, *progressInfoPtr,
1632 sizeof(RF_ProgressInfo_t));
1633 return (retcode);
1634
1635 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1636 if (raidPtr->Layout.map->faultsTolerated == 0) {
1637 /* This makes no sense on a RAID 0 */
1638 *(int *) data = 100;
1639 return(0);
1640 }
1641 if (raidPtr->copyback_in_progress == 1) {
1642 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1643 raidPtr->Layout.numStripe;
1644 } else {
1645 *(int *) data = 100;
1646 }
1647 return (0);
1648
1649 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1650 progressInfoPtr = (RF_ProgressInfo_t **) data;
1651 if (raidPtr->copyback_in_progress == 1) {
1652 progressInfo.total = raidPtr->Layout.numStripe;
1653 progressInfo.completed =
1654 raidPtr->copyback_stripes_done;
1655 progressInfo.remaining = progressInfo.total -
1656 progressInfo.completed;
1657 } else {
1658 progressInfo.remaining = 0;
1659 progressInfo.completed = 100;
1660 progressInfo.total = 100;
1661 }
1662 retcode = copyout(&progressInfo, *progressInfoPtr,
1663 sizeof(RF_ProgressInfo_t));
1664 return (retcode);
1665
1666 /* the sparetable daemon calls this to wait for the kernel to
1667 * need a spare table. this ioctl does not return until a
1668 * spare table is needed. XXX -- calling mpsleep here in the
1669 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1670 * -- I should either compute the spare table in the kernel,
1671 * or have a different -- XXX XXX -- interface (a different
1672 * character device) for delivering the table -- XXX */
1673 #if 0
1674 case RAIDFRAME_SPARET_WAIT:
1675 rf_lock_mutex2(rf_sparet_wait_mutex);
1676 while (!rf_sparet_wait_queue)
1677 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1678 waitreq = rf_sparet_wait_queue;
1679 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1680 rf_unlock_mutex2(rf_sparet_wait_mutex);
1681
1682 /* structure assignment */
1683 *((RF_SparetWait_t *) data) = *waitreq;
1684
1685 RF_Free(waitreq, sizeof(*waitreq));
1686 return (0);
1687
1688 /* wakes up a process waiting on SPARET_WAIT and puts an error
 * code in it that will cause the daemon to exit */
1690 case RAIDFRAME_ABORT_SPARET_WAIT:
1691 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1692 waitreq->fcol = -1;
1693 rf_lock_mutex2(rf_sparet_wait_mutex);
1694 waitreq->next = rf_sparet_wait_queue;
1695 rf_sparet_wait_queue = waitreq;
1696 rf_broadcast_conf2(rf_sparet_wait_cv);
1697 rf_unlock_mutex2(rf_sparet_wait_mutex);
1698 return (0);
1699
1700 /* used by the spare table daemon to deliver a spare table
1701 * into the kernel */
1702 case RAIDFRAME_SEND_SPARET:
1703
1704 /* install the spare table */
1705 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1706
1707 /* respond to the requestor. the return status of the spare
1708 * table installation is passed in the "fcol" field */
1709 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1710 waitreq->fcol = retcode;
1711 rf_lock_mutex2(rf_sparet_wait_mutex);
1712 waitreq->next = rf_sparet_resp_queue;
1713 rf_sparet_resp_queue = waitreq;
1714 rf_broadcast_cond2(rf_sparet_resp_cv);
1715 rf_unlock_mutex2(rf_sparet_wait_mutex);
1716
1717 return (retcode);
1718 #endif
1719
1720 default:
1721 break; /* fall through to the os-specific code below */
1722
1723 }
1724
1725 if (!raidPtr->valid)
1726 return (EINVAL);
1727
1728 /*
1729 * Add support for "regular" device ioctls here.
1730 */
1731
1732 error = dk_ioctl(dksc, dev, cmd, data, flag, l);
1733 if (error != EPASSTHROUGH)
1734 return (error);
1735
1736 switch (cmd) {
1737 case DIOCCACHESYNC:
1738 return rf_sync_component_caches(raidPtr);
1739
1740 default:
1741 retcode = ENOTTY;
1742 }
1743 return (retcode);
1744
1745 }
1746
1747
1748 /* raidinit -- complete the rest of the initialization for the
1749 RAIDframe device. */
1750
1751
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/*
	 * Build the device name ("raidN").  snprintf() bounds the write;
	 * the historical XXX predates its use.
	 */
	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: release the cfdata and leave the unit
		 * uninitialized (RAIDF_INITED stays clear). */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* FCFS queue sorted by raw block number for the dk(4) layer. */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Probe for wedges (GPT partitions etc.) on the new disk. */
	dkwedge_discover(&dksc->sc_dkdev);
}
1807
1808 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1809 /* wake up the daemon & tell it to get us a spare table
1810 * XXX
1811 * the entries in the queues should be tagged with the raidPtr
1812 * so that in the extremely rare case that two recons happen at once,
1813 * we know for which device were requesting a spare table
1814 * XXX
1815 *
1816 * XXX This code is not currently used. GO
1817 */
/*
 * Hand `req' to the userland sparetable daemon via rf_sparet_wait_queue,
 * then block until a response appears on rf_sparet_resp_queue.  Returns
 * the status the daemon placed in the response's fcol field.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	/* Queue the request and wake any daemon waiting in SPARET_WAIT. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2() drops the mutex while asleep (formerly mpsleep) */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the response; note `req' now points at the response object,
	 * not the request we queued above. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1841 #endif
1842
1843 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1844 * bp & passes it down.
1845 * any calls originating in the kernel must use non-blocking I/O
1846 * do some extra sanity checking to return "appropriate" error values for
1847 * certain conditions (to make some standard utilities work)
1848 *
1849 * Formerly known as: rf_DoAccessKernel
1850 */
1851 void
1852 raidstart(RF_Raid_t *raidPtr)
1853 {
1854 struct raid_softc *rs;
1855 struct dk_softc *dksc;
1856
1857 rs = raidPtr->softc;
1858 dksc = &rs->sc_dksc;
1859 /* quick check to see if anything has died recently */
1860 rf_lock_mutex2(raidPtr->mutex);
1861 if (raidPtr->numNewFailures > 0) {
1862 rf_unlock_mutex2(raidPtr->mutex);
1863 rf_update_component_labels(raidPtr,
1864 RF_NORMAL_COMPONENT_UPDATE);
1865 rf_lock_mutex2(raidPtr->mutex);
1866 raidPtr->numNewFailures--;
1867 }
1868 rf_unlock_mutex2(raidPtr->mutex);
1869
1870 if ((rs->sc_flags & RAIDF_INITED) == 0) {
1871 printf("raid%d: raidstart not ready\n", raidPtr->raidid);
1872 return;
1873 }
1874
1875 dk_start(dksc, NULL);
1876 }
1877
/*
 * Validate a buf against the array's geometry and hand it to RAIDframe.
 * Returns EAGAIN when no openings are available (caller retries later),
 * ENOSPC for out-of-range or misaligned requests, otherwise the result
 * of rf_DoAccess().
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int do_async;
	int rc;

	/* No free openings: ask the caller to try again later.
	 * NOTE(review): openings is re-examined and decremented further
	 * below under a fresh lock acquisition — presumably safe because
	 * of how callers serialize; confirm before relying on it. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* Whole sectors in the request, plus one if there is a partial
	 * trailing sector (pb). */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* XXX "1 ||" forces this branch on unconditionally — looks like
	 * leftover debug scaffolding that bypasses rf_debugKernelAccess. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Reject requests past the end of the array; the (sum < x) terms
	 * catch arithmetic wraparound of the unsigned sum. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Reject transfers that are not a whole number of sectors. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening for this I/O. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
1950
1951 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
1952
1953 int
1954 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
1955 {
1956 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
1957 struct buf *bp;
1958
1959 req->queue = queue;
1960 bp = req->bp;
1961
1962 switch (req->type) {
1963 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
1964 /* XXX need to do something extra here.. */
1965 /* I'm leaving this in, as I've never actually seen it used,
1966 * and I'd like folks to report it... GO */
1967 printf(("WAKEUP CALLED\n"));
1968 queue->numOutstanding++;
1969
1970 bp->b_flags = 0;
1971 bp->b_private = req;
1972
1973 KernelWakeupFunc(bp);
1974 break;
1975
1976 case RF_IO_TYPE_READ:
1977 case RF_IO_TYPE_WRITE:
1978 #if RF_ACC_TRACE > 0
1979 if (req->tracerec) {
1980 RF_ETIMER_START(req->tracerec->timer);
1981 }
1982 #endif
1983 InitBP(bp, queue->rf_cinfo->ci_vp,
1984 op, queue->rf_cinfo->ci_dev,
1985 req->sectorOffset, req->numSector,
1986 req->buf, KernelWakeupFunc, (void *) req,
1987 queue->raidPtr->logBytesPerSector, req->b_proc);
1988
1989 if (rf_debugKernelAccess) {
1990 db1_printf(("dispatch: bp->b_blkno = %ld\n",
1991 (long) bp->b_blkno));
1992 }
1993 queue->numOutstanding++;
1994 queue->last_deq_sector = req->sectorOffset;
1995 /* acc wouldn't have been let in if there were any pending
1996 * reqs at any other priority */
1997 queue->curPriority = req->priority;
1998
1999 db1_printf(("Going for %c to unit %d col %d\n",
2000 req->type, queue->raidPtr->raidid,
2001 queue->col));
2002 db1_printf(("sector %d count %d (%d bytes) %d\n",
2003 (int) req->sectorOffset, (int) req->numSector,
2004 (int) (req->numSector <<
2005 queue->raidPtr->logBytesPerSector),
2006 (int) queue->raidPtr->logBytesPerSector));
2007
2008 /*
2009 * XXX: drop lock here since this can block at
2010 * least with backing SCSI devices. Retake it
2011 * to minimize fuss with calling interfaces.
2012 */
2013
2014 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2015 bdev_strategy(bp);
2016 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2017 break;
2018
2019 default:
2020 panic("bad req->type in rf_DispatchKernelIO");
2021 }
2022 db1_printf(("Exiting from DispatchKernelIO\n"));
2023
2024 return (0);
2025 }
2026 /* this is the callback function associated with a I/O invoked from
2027 kernel code.
2028 */
/* this is the callback function associated with a I/O invoked from
   kernel code.

   Records the completion status of `bp', marks the owning component as
   failed on I/O error (when the set can tolerate it), queues the request
   on the raidPtr->iodone list, and signals the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The originating request was stashed in b_private by InitBP /
	 * rf_DispatchKernelIO. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/* Everything below (failure accounting, iodone queue) is
	 * serialized by iodone_lock. */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a component-label update
			 * on the next raidstart(). */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2095
2096
2097 /*
2098 * initialize a buf structure for doing an I/O in the kernel.
2099 */
2100 static void
2101 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2102 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2103 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2104 struct proc *b_proc)
2105 {
2106 /* bp->b_flags = B_PHYS | rw_flag; */
2107 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2108 bp->b_oflags = 0;
2109 bp->b_cflags = 0;
2110 bp->b_bcount = numSect << logBytesPerSector;
2111 bp->b_bufsize = bp->b_bcount;
2112 bp->b_error = 0;
2113 bp->b_dev = dev;
2114 bp->b_data = bf;
2115 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2116 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2117 if (bp->b_bcount == 0) {
2118 panic("bp->b_bcount is zero in InitBP!!");
2119 }
2120 bp->b_proc = b_proc;
2121 bp->b_iodone = cbFunc;
2122 bp->b_private = cbArg;
2123 }
2124
2125 /*
2126 * Wait interruptibly for an exclusive lock.
2127 *
2128 * XXX
2129 * Several drivers do this; it should be abstracted and made MP-safe.
2130 * (Hmm... where have we seen this warning before :-> GO )
2131 */
2132 static int
2133 raidlock(struct raid_softc *rs)
2134 {
2135 int error;
2136
2137 error = 0;
2138 mutex_enter(&rs->sc_mutex);
2139 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2140 rs->sc_flags |= RAIDF_WANTED;
2141 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2142 if (error != 0)
2143 goto done;
2144 }
2145 rs->sc_flags |= RAIDF_LOCKED;
2146 done:
2147 mutex_exit(&rs->sc_mutex);
2148 return (error);
2149 }
2150 /*
2151 * Unlock and wake up any waiters.
2152 */
2153 static void
2154 raidunlock(struct raid_softc *rs)
2155 {
2156
2157 mutex_enter(&rs->sc_mutex);
2158 rs->sc_flags &= ~RAIDF_LOCKED;
2159 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2160 rs->sc_flags &= ~RAIDF_WANTED;
2161 cv_broadcast(&rs->sc_cv);
2162 }
2163 mutex_exit(&rs->sc_mutex);
2164 }
2165
2166
2167 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2168 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2169 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2170
static daddr_t
rf_component_info_offset(void)
{

	/* Byte offset of the component label area on each component. */
	return RF_COMPONENT_INFO_OFFSET;
}
2177
2178 static daddr_t
2179 rf_component_info_size(unsigned secsize)
2180 {
2181 daddr_t info_size;
2182
2183 KASSERT(secsize);
2184 if (secsize > RF_COMPONENT_INFO_SIZE)
2185 info_size = secsize;
2186 else
2187 info_size = RF_COMPONENT_INFO_SIZE;
2188
2189 return info_size;
2190 }
2191
2192 static daddr_t
2193 rf_parity_map_offset(RF_Raid_t *raidPtr)
2194 {
2195 daddr_t map_offset;
2196
2197 KASSERT(raidPtr->bytesPerSector);
2198 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2199 map_offset = raidPtr->bytesPerSector;
2200 else
2201 map_offset = RF_COMPONENT_INFO_SIZE;
2202 map_offset += rf_component_info_offset();
2203
2204 return map_offset;
2205 }
2206
2207 static daddr_t
2208 rf_parity_map_size(RF_Raid_t *raidPtr)
2209 {
2210 daddr_t map_size;
2211
2212 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2213 map_size = raidPtr->bytesPerSector;
2214 else
2215 map_size = RF_PARITY_MAP_SIZE;
2216
2217 return map_size;
2218 }
2219
2220 int
2221 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2222 {
2223 RF_ComponentLabel_t *clabel;
2224
2225 clabel = raidget_component_label(raidPtr, col);
2226 clabel->clean = RF_RAID_CLEAN;
2227 raidflush_component_label(raidPtr, col);
2228 return(0);
2229 }
2230
2231
2232 int
2233 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2234 {
2235 RF_ComponentLabel_t *clabel;
2236
2237 clabel = raidget_component_label(raidPtr, col);
2238 clabel->clean = RF_RAID_DIRTY;
2239 raidflush_component_label(raidPtr, col);
2240 return(0);
2241 }
2242
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* Read the on-disk label of component `col' into the in-core copy. */
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2252
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* Return the in-core component label for `col'; no disk I/O. */
	return &raidPtr->raid_cinfo[col].ci_label;
}
2258
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	/*
	 * Write the in-core component label for `col' to disk, stamping
	 * it with the array's current modification counter first.
	 */
	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity map's mod counter in step with the label's */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
2273
2274
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* Read the component label area on `dev' into *clabel; 0 or errno. */
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
2284
/* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/*
	 * Synchronously read `dsize' bytes of a reserved component area
	 * (label or parity map) at byte `offset' on `dev', copying the
	 * first `msize' bytes out to `data'.  Returns 0 or an errno.
	 */

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
	bp->b_resid = dsize;

	/* issue the read and wait for completion */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		/* only the caller-requested prefix is copied out */
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
2322
2323
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* Synchronously write *clabel to the component label area on `dev'. */
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
2333
/* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/*
	 * Write `msize' bytes from `data' (zero-padded out to `dsize')
	 * to the reserved component area at byte `offset' on `dev'.
	 * If `asyncp' is non-zero the write is issued B_ASYNC and 0 is
	 * returned immediately; NOTE(review): in that case the buffer
	 * is presumably released on I/O completion rather than here --
	 * confirm against the biodone()/brelse() path.
	 */

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
	bp->b_resid = dsize;

	/* zero-fill first so the pad past `msize' is deterministic */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
2368
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	/* Write the on-disk parity map to every non-failed component. */
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
2386
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	/*
	 * Read the on-disk parity map from each non-failed component
	 * and combine the copies into *map via rf_paritymap_merge().
	 * Read errors are not checked here.
	 */
	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			/* the first copy read seeds the result... */
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			/* ...subsequent copies are merged in */
			rf_paritymap_merge(map, &tmp);
		}
	}
}
2411
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/*
	 * Bump the array's modification counter and mark the component
	 * label of every non-failed component -- and every in-use
	 * spare -- dirty on disk.
	 */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column maps to this spare,
			   scol keeps its previous value (initially -1);
			   presumably an in-use spare always maps -- confirm. */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2471
2472
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	/*
	 * Bump the modification counter and rewrite the label of every
	 * optimal component and every in-use spare.  When `final' is
	 * RF_FINAL_COMPONENT_UPDATE and parity is known good, labels
	 * are additionally marked clean (the shutdown path).
	 */
	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which data column this spare replaced */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2547
void
rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
{

	/*
	 * Close and release a component's vnode.  Auto-configured
	 * components are closed with VOP_CLOSE/vput directly; others
	 * go through vn_close() with the current lwp's credentials.
	 * A NULL vp is silently ignored.
	 */
	if (vp != NULL) {
		if (auto_configured == 1) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

		} else {
			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
		}
	}
}
2563
2564
2565 void
2566 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2567 {
2568 int r,c;
2569 struct vnode *vp;
2570 int acd;
2571
2572
2573 /* We take this opportunity to close the vnodes like we should.. */
2574
2575 for (c = 0; c < raidPtr->numCol; c++) {
2576 vp = raidPtr->raid_cinfo[c].ci_vp;
2577 acd = raidPtr->Disks[c].auto_configured;
2578 rf_close_component(raidPtr, vp, acd);
2579 raidPtr->raid_cinfo[c].ci_vp = NULL;
2580 raidPtr->Disks[c].auto_configured = 0;
2581 }
2582
2583 for (r = 0; r < raidPtr->numSpare; r++) {
2584 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2585 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2586 rf_close_component(raidPtr, vp, acd);
2587 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2588 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2589 }
2590 }
2591
2592
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	/*
	 * Kernel thread body: fail the component named in the request
	 * via rf_FailDisk() (initiating reconstruction when the
	 * RF_FDFLAGS_RECON flag is set), then free the request and exit.
	 */
	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* third argument tells rf_FailDisk whether to start recon */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* the request was allocated by our creator; we free it here */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2614
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	/*
	 * Kernel thread body: rewrite all parity for the array.  On
	 * success the array is flagged clean; on failure the error is
	 * logged and parity_good is left as it was.
	 */
	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		       raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2645
2646
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	/*
	 * Kernel thread body: run rf_CopybackReconstructedData() for
	 * the array (copying reconstructed data back from a spare),
	 * then exit.
	 */
	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2661
2662
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	/*
	 * Kernel thread body: reconstruct component req->col in place,
	 * free the request, and exit.
	 */
	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* the request was allocated by our creator; we free it here */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2680
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	/*
	 * Probe one candidate device/partition for a RAIDframe component
	 * label.  On success a new RF_AutoConfig_t is prepended to
	 * ac_list and ownership of `vp' transfers to the list entry;
	 * otherwise the vnode is closed and released here.  Returns the
	 * (possibly updated) list head, or NULL after freeing the whole
	 * list on memory exhaustion.
	 */
	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* out of memory: tear down everything collected so far */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			       cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no usable component here, so the label and
		   the vnode are released before returning */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2738
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/*
	 * Scan every disk-class device in the system for RAIDframe
	 * component labels and return a list of candidates for
	 * autoconfiguration (rf_get_component() does the per-device
	 * label probing).
	 */

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			/* FSILENT: don't log complaints about devices
			   that refuse to open */
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				printf("RAIDframe: can't get disk size for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* only wedges explicitly typed RAIDframe
				   are considered */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes ownership of vp */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
2935
2936
int
rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	/*
	 * Sanity-check a component label read from disk: known version,
	 * clean flag one of its two legal values, row/column within the
	 * claimed geometry, and positive sizes.  Returns 1 if the label
	 * looks usable, 0 otherwise.  As a side effect a reasonable
	 * label may have stale high-size fields cleared by
	 * rf_fix_old_label_size().
	 */
	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
	    ((clabel->clean == RF_RAID_CLEAN) ||
	     (clabel->clean == RF_RAID_DIRTY)) &&
	    clabel->row >=0 &&
	    clabel->column >= 0 &&
	    clabel->num_rows > 0 &&
	    clabel->num_columns > 0 &&
	    clabel->row < clabel->num_rows &&
	    clabel->column < clabel->num_columns &&
	    clabel->blockSize > 0 &&
	    /*
	     * numBlocksHi may contain garbage, but it is ok since
	     * the type is unsigned.  If it is really garbage,
	     * rf_fix_old_label_size() will fix it.
	     */
	    rf_component_label_numblocks(clabel) > 0) {
		/*
		 * label looks reasonable enough...
		 * let's make sure it has no old garbage.
		 */
		if (numsecs)
			rf_fix_old_label_size(clabel, numsecs);
		return(1);
	}
	return(0);
}
2968
2969
2970 /*
2971 * For reasons yet unknown, some old component labels have garbage in
2972 * the newer numBlocksHi region, and this causes lossage. Since those
2973 * disks will also have numsecs set to less than 32 bits of sectors,
2974 * we can determine when this corruption has occurred, and fix it.
2975 *
2976 * The exact same problem, with the same unknown reason, happens to
2977 * the partitionSizeHi member as well.
2978 */
static void
rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	/*
	 * If the component has fewer than 2^32 sectors, the *Hi fields
	 * cannot legitimately be non-zero; treat any non-zero value as
	 * old-label garbage and clear it (see comment above).
	 */
	if (numsecs < ((uint64_t)1 << 32)) {
		if (clabel->numBlocksHi) {
			printf("WARNING: total sectors < 32 bits, yet "
			       "numBlocksHi set\n"
			       "WARNING: resetting numBlocksHi to zero.\n");
			clabel->numBlocksHi = 0;
		}

		if (clabel->partitionSizeHi) {
			printf("WARNING: total sectors < 32 bits, yet "
			       "partitionSizeHi set\n"
			       "WARNING: resetting partitionSizeHi to zero.\n");
			clabel->partitionSizeHi = 0;
		}
	}
}
2999
3000
3001 #ifdef DEBUG
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* printable names for the two low bits of root_partition */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};

	/* Debug helper: dump the interesting fields of a component label. */

	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
3033 #endif
3034
3035 RF_ConfigSet_t *
3036 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3037 {
3038 RF_AutoConfig_t *ac;
3039 RF_ConfigSet_t *config_sets;
3040 RF_ConfigSet_t *cset;
3041 RF_AutoConfig_t *ac_next;
3042
3043
3044 config_sets = NULL;
3045
3046 /* Go through the AutoConfig list, and figure out which components
3047 belong to what sets. */
3048 ac = ac_list;
3049 while(ac!=NULL) {
3050 /* we're going to putz with ac->next, so save it here
3051 for use at the end of the loop */
3052 ac_next = ac->next;
3053
3054 if (config_sets == NULL) {
3055 /* will need at least this one... */
3056 config_sets = (RF_ConfigSet_t *)
3057 malloc(sizeof(RF_ConfigSet_t),
3058 M_RAIDFRAME, M_NOWAIT);
3059 if (config_sets == NULL) {
3060 panic("rf_create_auto_sets: No memory!");
3061 }
3062 /* this one is easy :) */
3063 config_sets->ac = ac;
3064 config_sets->next = NULL;
3065 config_sets->rootable = 0;
3066 ac->next = NULL;
3067 } else {
3068 /* which set does this component fit into? */
3069 cset = config_sets;
3070 while(cset!=NULL) {
3071 if (rf_does_it_fit(cset, ac)) {
3072 /* looks like it matches... */
3073 ac->next = cset->ac;
3074 cset->ac = ac;
3075 break;
3076 }
3077 cset = cset->next;
3078 }
3079 if (cset==NULL) {
3080 /* didn't find a match above... new set..*/
3081 cset = (RF_ConfigSet_t *)
3082 malloc(sizeof(RF_ConfigSet_t),
3083 M_RAIDFRAME, M_NOWAIT);
3084 if (cset == NULL) {
3085 panic("rf_create_auto_sets: No memory!");
3086 }
3087 cset->ac = ac;
3088 ac->next = NULL;
3089 cset->next = config_sets;
3090 cset->rootable = 0;
3091 config_sets = cset;
3092 }
3093 }
3094 ac = ac_next;
3095 }
3096
3097
3098 return(config_sets);
3099 }
3100
3101 static int
3102 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3103 {
3104 RF_ComponentLabel_t *clabel1, *clabel2;
3105
3106 /* If this one matches the *first* one in the set, that's good
3107 enough, since the other members of the set would have been
3108 through here too... */
3109 /* note that we are not checking partitionSize here..
3110
3111 Note that we are also not checking the mod_counters here.
3112 If everything else matches except the mod_counter, that's
3113 good enough for this test. We will deal with the mod_counters
3114 a little later in the autoconfiguration process.
3115
3116 (clabel1->mod_counter == clabel2->mod_counter) &&
3117
3118 The reason we don't check for this is that failed disks
3119 will have lower modification counts. If those disks are
3120 not added to the set they used to belong to, then they will
3121 form their own set, which may result in 2 different sets,
3122 for example, competing to be configured at raid0, and
3123 perhaps competing to be the root filesystem set. If the
3124 wrong ones get configured, or both attempt to become /,
3125 weird behaviour and or serious lossage will occur. Thus we
3126 need to bring them into the fold here, and kick them out at
3127 a later point.
3128
3129 */
3130
3131 clabel1 = cset->ac->clabel;
3132 clabel2 = ac->clabel;
3133 if ((clabel1->version == clabel2->version) &&
3134 (clabel1->serial_number == clabel2->serial_number) &&
3135 (clabel1->num_rows == clabel2->num_rows) &&
3136 (clabel1->num_columns == clabel2->num_columns) &&
3137 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3138 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3139 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3140 (clabel1->parityConfig == clabel2->parityConfig) &&
3141 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3142 (clabel1->blockSize == clabel2->blockSize) &&
3143 rf_component_label_numblocks(clabel1) ==
3144 rf_component_label_numblocks(clabel2) &&
3145 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3146 (clabel1->root_partition == clabel2->root_partition) &&
3147 (clabel1->last_unit == clabel2->last_unit) &&
3148 (clabel1->config_order == clabel2->config_order)) {
3149 /* if it get's here, it almost *has* to be a match */
3150 } else {
3151 /* it's not consistent with somebody in the set..
3152 punt */
3153 return(0);
3154 }
3155 /* all was fine.. it must fit... */
3156 return(1);
3157 }
3158
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;

	/*
	 * Decide whether this configuration set has enough live (and
	 * current, per mod_counter) components to be configured.
	 * Returns 1 if so, 0 otherwise.
	 */

	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				/* the highest counter seen wins */
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a component for column c whose mod_counter
		   is current; stale components do not count */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just finished the odd member of a mirror pair
			   without bailing out.. reset the
			   even_pair_failed flag, and go on to the next
			   pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* per-level tolerance: RAID 0 allows no missing components,
	   RAID 4/5 allow at most one */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3261
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
    RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	/*
	 * Build an RF_Config_t for an auto-detected set from its
	 * component labels.  Geometry is taken from the first label;
	 * device names are filled in per column.
	 */
	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numRow = clabel->num_rows = 1;
	config->numCol = clabel->num_columns;
	config->numSpare = 0; /* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ?? */

	while(ac!=NULL) {
		/* row/col values will be in range due to the checks
		   in reasonable_label() */
		strcpy(config->devnames[0][ac->clabel->column],
		       ac->devname);
		ac = ac->next;
	}

	/* no debug variables are carried over */
	for(i=0;i<RF_MAXDBGV;i++) {
		config->debugVars[i][0] = 0;
	}
}
3296
3297 int
3298 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3299 {
3300 RF_ComponentLabel_t *clabel;
3301 int column;
3302 int sparecol;
3303
3304 raidPtr->autoconfigure = new_value;
3305
3306 for(column=0; column<raidPtr->numCol; column++) {
3307 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3308 clabel = raidget_component_label(raidPtr, column);
3309 clabel->autoconfigure = new_value;
3310 raidflush_component_label(raidPtr, column);
3311 }
3312 }
3313 for(column = 0; column < raidPtr->numSpare ; column++) {
3314 sparecol = raidPtr->numCol + column;
3315 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3316 clabel = raidget_component_label(raidPtr, sparecol);
3317 clabel->autoconfigure = new_value;
3318 raidflush_component_label(raidPtr, sparecol);
3319 }
3320 }
3321 return(new_value);
3322 }
3323
3324 int
3325 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3326 {
3327 RF_ComponentLabel_t *clabel;
3328 int column;
3329 int sparecol;
3330
3331 raidPtr->root_partition = new_value;
3332 for(column=0; column<raidPtr->numCol; column++) {
3333 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3334 clabel = raidget_component_label(raidPtr, column);
3335 clabel->root_partition = new_value;
3336 raidflush_component_label(raidPtr, column);
3337 }
3338 }
3339 for(column = 0; column < raidPtr->numSpare ; column++) {
3340 sparecol = raidPtr->numCol + column;
3341 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3342 clabel = raidget_component_label(raidPtr, sparecol);
3343 clabel->root_partition = new_value;
3344 raidflush_component_label(raidPtr, sparecol);
3345 }
3346 }
3347 return(new_value);
3348 }
3349
3350 void
3351 rf_release_all_vps(RF_ConfigSet_t *cset)
3352 {
3353 RF_AutoConfig_t *ac;
3354
3355 ac = cset->ac;
3356 while(ac!=NULL) {
3357 /* Close the vp, and give it back */
3358 if (ac->vp) {
3359 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3360 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3361 vput(ac->vp);
3362 ac->vp = NULL;
3363 }
3364 ac = ac->next;
3365 }
3366 }
3367
3368
3369 void
3370 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3371 {
3372 RF_AutoConfig_t *ac;
3373 RF_AutoConfig_t *next_ac;
3374
3375 ac = cset->ac;
3376 while(ac!=NULL) {
3377 next_ac = ac->next;
3378 /* nuke the label */
3379 free(ac->clabel, M_RAIDFRAME);
3380 /* cleanup the config structure */
3381 free(ac, M_RAIDFRAME);
3382 /* "next.." */
3383 ac = next_ac;
3384 }
3385 /* and, finally, nuke the config set */
3386 free(cset, M_RAIDFRAME);
3387 }
3388
3389
/*
 * Populate a component label from the current in-core state of the
 * RAID set.  The caller is responsible for writing the label out.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* geometry of the set this component belongs to */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	/* remember which raid unit this was, so autoconfig can try to
	   reclaim the same unit number next boot */
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3422
/*
 * Auto-configure a single config set: build an RF_Config_t from the
 * component labels, find (or create) a free raid unit, and configure
 * the set on it.  Returns the configured softc, or NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Start at the unit recorded in the label; walk upward past any
	   units that already hold a valid (configured) set. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* No existing softc at the chosen unit: create one. */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3506
/*
 * Initialize a pool(9) used by RAIDframe: prime it with xmin items and
 * set the low/high watermarks so at least xmin items stay cached and at
 * most xmax are kept.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3516
3517 /*
3518 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3519 * to see if there is IO pending and if that IO could possibly be done
3520 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3521 * otherwise.
3522 *
3523 */
3524 int
3525 rf_buf_queue_check(RF_Raid_t *raidPtr)
3526 {
3527 struct raid_softc *rs;
3528 struct dk_softc *dksc;
3529
3530 rs = raidPtr->softc;
3531 dksc = &rs->sc_dksc;
3532
3533 if ((rs->sc_flags & RAIDF_INITED) == 0)
3534 return 1;
3535
3536 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3537 /* there is work to do */
3538 return 0;
3539 }
3540 /* default is nothing to do */
3541 return 1;
3542 }
3543
3544 int
3545 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3546 {
3547 uint64_t numsecs;
3548 unsigned secsize;
3549 int error;
3550
3551 error = getdisksize(vp, &numsecs, &secsize);
3552 if (error == 0) {
3553 diskPtr->blockSize = secsize;
3554 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3555 diskPtr->partitionSize = numsecs;
3556 return 0;
3557 }
3558 return error;
3559 }
3560
/* Autoconf match: a raid pseudo-device always matches. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3566
/* Autoconf attach: nothing to do; real setup happens at configure time. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3571
3572
3573 static int
3574 raid_detach(device_t self, int flags)
3575 {
3576 int error;
3577 struct raid_softc *rs = raidsoftc(self);
3578
3579 if (rs == NULL)
3580 return ENXIO;
3581
3582 if ((error = raidlock(rs)) != 0)
3583 return (error);
3584
3585 error = raid_detach_unlocked(rs);
3586
3587 raidunlock(rs);
3588
3589 /* XXX raid can be referenced here */
3590
3591 if (error)
3592 return error;
3593
3594 /* Free the softc */
3595 raidput(rs);
3596
3597 return 0;
3598 }
3599
/*
 * Publish a synthetic disk geometry for the RAID set so disklabel and
 * friends have something sensible to report.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	/* fabricated values: one "track" per data stripe, 4 tracks per
	   column; NOTE(review): dg_ncylinders is left zero here —
	   presumably derived by disk_set_info(); confirm */
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
3615
3616 /*
3617 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3618 * We end up returning whatever error was returned by the first cache flush
3619 * that fails.
3620 */
3621
3622 int
3623 rf_sync_component_caches(RF_Raid_t *raidPtr)
3624 {
3625 int c, sparecol;
3626 int e,error;
3627 int force = 1;
3628
3629 error = 0;
3630 for (c = 0; c < raidPtr->numCol; c++) {
3631 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3632 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3633 &force, FWRITE, NOCRED);
3634 if (e) {
3635 if (e != ENODEV)
3636 printf("raid%d: cache flush to component %s failed.\n",
3637 raidPtr->raidid, raidPtr->Disks[c].devname);
3638 if (error == 0) {
3639 error = e;
3640 }
3641 }
3642 }
3643 }
3644
3645 for( c = 0; c < raidPtr->numSpare ; c++) {
3646 sparecol = raidPtr->numCol + c;
3647 /* Need to ensure that the reconstruct actually completed! */
3648 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3649 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3650 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3651 if (e) {
3652 if (e != ENODEV)
3653 printf("raid%d: cache flush to component %s failed.\n",
3654 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3655 if (error == 0) {
3656 error = e;
3657 }
3658 }
3659 }
3660 }
3661 return error;
3662 }
3663
3664 /*
3665 * Module interface
3666 */
3667
3668 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
3669
3670 #ifdef _MODULE
3671 CFDRIVER_DECL(raid, DV_DISK, NULL);
3672 #endif
3673
3674 static int raid_modcmd(modcmd_t, void *);
3675 static int raid_modcmd_init(void);
3676 static int raid_modcmd_fini(void);
3677
3678 static int
3679 raid_modcmd(modcmd_t cmd, void *data)
3680 {
3681 int error;
3682
3683 error = 0;
3684 switch (cmd) {
3685 case MODULE_CMD_INIT:
3686 error = raid_modcmd_init();
3687 break;
3688 case MODULE_CMD_FINI:
3689 error = raid_modcmd_fini();
3690 break;
3691 default:
3692 error = ENOTTY;
3693 break;
3694 }
3695 return error;
3696 }
3697
/*
 * Module initialization: set up locks, attach the devsw and the
 * autoconf driver/attach glue, boot the RAIDframe core, and register
 * the finalizer that performs autoconfiguration of RAID sets.  Each
 * failure path unwinds exactly the steps already completed.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 requests dynamic major number allocation */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST means the devsw is already present (built-in case) and
	   is tolerated here */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* unwind: detach the devsw attached above */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* unwind: cfdriver (module case) and devsw */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* NOTE(review): if devsw_attach returned EEXIST above, error is
	   still EEXIST here and rf_BootRaidframe() is skipped; the
	   EEXIST is then overwritten below — looks intentional for the
	   built-in case, but confirm */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		/* non-fatal: module loads without autoconfiguration */
		error = 0;
	}

	return error;
}
3768
/*
 * Module finalization: refuse to unload while any raid device exists,
 * then detach the autoconf glue and devsw in reverse order of
 * attachment, shut down the RAIDframe core, and destroy the locks.
 * On a partial-detach failure the already-detached pieces are
 * re-attached so the module remains usable.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* roll back: re-attach the cfattach removed above */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* roll back: re-attach cfdriver (module case) and cfattach */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
3818