rf_netbsdkintf.c revision 1.341 1 /* $NetBSD: rf_netbsdkintf.c,v 1.341 2016/01/06 17:40:50 christos Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
/***********************************************************
 *
 * rf_netbsdkintf.c -- the kernel interface routines for RAIDframe
 *
 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.341 2016/01/06 17:40:50 christos Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130
131 #include <prop/proplib.h>
132
133 #include <dev/raidframe/raidframevar.h>
134 #include <dev/raidframe/raidframeio.h>
135 #include <dev/raidframe/rf_paritymap.h>
136
137 #include "rf_raid.h"
138 #include "rf_copyback.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150
151 #ifdef COMPAT_50
152 #include "rf_compat50.h"
153 #endif
154
155 #include "ioconf.h"
156
157 #ifdef DEBUG
158 int rf_kdebug_level = 0;
159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
160 #else /* DEBUG */
161 #define db1_printf(a) { }
162 #endif /* DEBUG */
163
164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
165 static rf_declare_mutex2(rf_sparet_wait_mutex);
166 static rf_declare_cond2(rf_sparet_wait_cv);
167 static rf_declare_cond2(rf_sparet_resp_cv);
168
169 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
170 * spare table */
171 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
172 * installation process */
173 #endif
174
175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
176
177 /* prototypes */
178 static void KernelWakeupFunc(struct buf *);
179 static void InitBP(struct buf *, struct vnode *, unsigned,
180 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
181 void *, int, struct proc *);
182 struct raid_softc;
183 static void raidinit(struct raid_softc *);
184 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
185
186 static int raid_match(device_t, cfdata_t, void *);
187 static void raid_attach(device_t, device_t, void *);
188 static int raid_detach(device_t, int);
189
190 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
191 daddr_t, daddr_t);
192 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
193 daddr_t, daddr_t, int);
194
195 static int raidwrite_component_label(unsigned,
196 dev_t, struct vnode *, RF_ComponentLabel_t *);
197 static int raidread_component_label(unsigned,
198 dev_t, struct vnode *, RF_ComponentLabel_t *);
199
200 static int raid_diskstart(device_t, struct buf *bp);
201 static int raid_dumpblocks(device_t, void *, daddr_t, int);
202 static int raid_lastclose(device_t);
203
204 static dev_type_open(raidopen);
205 static dev_type_close(raidclose);
206 static dev_type_read(raidread);
207 static dev_type_write(raidwrite);
208 static dev_type_ioctl(raidioctl);
209 static dev_type_strategy(raidstrategy);
210 static dev_type_dump(raiddump);
211 static dev_type_size(raidsize);
212
/* Block-device switch for /dev/raidN: entry points used by the buffercache. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
223
/* Character-device (raw) switch for /dev/rraidN; raw I/O goes via physio. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
238
/* Hooks handed to the dk(9) disk framework for this pseudo-disk. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
248
/*
 * Per-unit software state for one RAID pseudo-device.  Lives on the
 * global 'raids' list; created by raidcreate() and freed by raiddestroy().
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk framework state */
	int     sc_unit;		/* raidN unit number */
	int     sc_flags;		/* flags (RAIDF_*, below) */
	int     sc_cflags;		/* configuration flags */
	kmutex_t sc_mutex;		/* interlock mutex */
	kcondvar_t sc_cv;		/* and the condvar */
	uint64_t sc_size;		/* size of the raid device */
	char    sc_xname[20];		/* XXX external name */
	RF_Raid_t sc_r;			/* RAIDframe per-array descriptor */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the global list */
};
/* sc_flags */
#define RAIDF_INITED	0x001	/* unit has been initialized */
#define RAIDF_WLABEL	0x002	/* label area is writable */
#define RAIDF_LABELLING	0x004	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x008	/* unit is being shutdown */
#define RAIDF_DETACH	0x010	/* detach after final close */
#define RAIDF_WANTED	0x040	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x080	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x100 /* unit is being changed */

#define	raidunit(x)	DISKUNIT(x)
/* Map a device_t back to its raid_softc via the RF_Raid_t back-pointer. */
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
273
274 extern struct cfdriver raid_cd;
275 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
276 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
277 DVF_DETACH_SHUTDOWN);
278
279 /*
280 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
281 * Be aware that large numbers can allow the driver to consume a lot of
282 * kernel memory, especially on writes, and in degraded mode reads.
283 *
284 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
285 * a single 64K write will typically require 64K for the old data,
286 * 64K for the old parity, and 64K for the new parity, for a total
287 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
289 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
290 *
291 * Now in degraded mode, for example, a 64K read on the above setup may
292 * require data reconstruction, which will require *all* of the 4 remaining
293 * disks to participate -- 4 * 32K/disk == 128K again.
294 */
295
296 #ifndef RAIDOUTSTANDING
297 #define RAIDOUTSTANDING 6
298 #endif
299
300 #define RAIDLABELDEV(dev) \
301 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
302
303 /* declared here, and made public, for the benefit of KVM stuff.. */
304
305 static int raidlock(struct raid_softc *);
306 static void raidunlock(struct raid_softc *);
307
308 static int raid_detach_unlocked(struct raid_softc *);
309
310 static void rf_markalldirty(RF_Raid_t *);
311 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
312
313 void rf_ReconThread(struct rf_recon_req *);
314 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
315 void rf_CopybackThread(RF_Raid_t *raidPtr);
316 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
317 int rf_autoconfig(device_t);
318 void rf_buildroothack(RF_ConfigSet_t *);
319
320 RF_AutoConfig_t *rf_find_raid_components(void);
321 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
322 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
323 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
324 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
325 int rf_set_autoconfig(RF_Raid_t *, int);
326 int rf_set_rootpartition(RF_Raid_t *, int);
327 void rf_release_all_vps(RF_ConfigSet_t *);
328 void rf_cleanup_config_set(RF_ConfigSet_t *);
329 int rf_have_enough_components(RF_ConfigSet_t *);
330 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
331 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
332
333 /*
334 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
335 * Note that this is overridden by having RAID_AUTOCONFIG as an option
336 * in the kernel config file.
337 */
338 #ifdef RAID_AUTOCONFIG
339 int raidautoconfig = 1;
340 #else
341 int raidautoconfig = 0;
342 #endif
343 static bool raidautoconfigdone = false;
344
345 struct RF_Pools_s rf_pools;
346
347 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
348 static kmutex_t raid_lock;
349
350 static struct raid_softc *
351 raidcreate(int unit) {
352 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
353 if (sc == NULL) {
354 #ifdef DIAGNOSTIC
355 printf("%s: out of memory\n", __func__);
356 #endif
357 return NULL;
358 }
359 sc->sc_unit = unit;
360 cv_init(&sc->sc_cv, "raidunit");
361 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
362 return sc;
363 }
364
365 static void
366 raiddestroy(struct raid_softc *sc) {
367 cv_destroy(&sc->sc_cv);
368 mutex_destroy(&sc->sc_mutex);
369 kmem_free(sc, sizeof(*sc));
370 }
371
372 static struct raid_softc *
373 raidget(int unit, bool create) {
374 struct raid_softc *sc;
375 if (unit < 0) {
376 #ifdef DIAGNOSTIC
377 panic("%s: unit %d!", __func__, unit);
378 #endif
379 return NULL;
380 }
381 mutex_enter(&raid_lock);
382 LIST_FOREACH(sc, &raids, sc_link) {
383 if (sc->sc_unit == unit) {
384 mutex_exit(&raid_lock);
385 return sc;
386 }
387 }
388 mutex_exit(&raid_lock);
389 if (!create)
390 return NULL;
391 if ((sc = raidcreate(unit)) == NULL)
392 return NULL;
393 mutex_enter(&raid_lock);
394 LIST_INSERT_HEAD(&raids, sc, sc_link);
395 mutex_exit(&raid_lock);
396 return sc;
397 }
398
399 static void
400 raidput(struct raid_softc *sc) {
401 mutex_enter(&raid_lock);
402 LIST_REMOVE(sc, sc_link);
403 mutex_exit(&raid_lock);
404 raiddestroy(sc);
405 }
406
/*
 * Historical pseudo-device attach entry point.  Intentionally empty:
 * device attachment and associated initialization now occurs as part
 * of the module initialization, so there is nothing to do here.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
416
417 int
418 rf_autoconfig(device_t self)
419 {
420 RF_AutoConfig_t *ac_list;
421 RF_ConfigSet_t *config_sets;
422
423 if (!raidautoconfig || raidautoconfigdone == true)
424 return (0);
425
426 /* XXX This code can only be run once. */
427 raidautoconfigdone = true;
428
429 #ifdef __HAVE_CPU_BOOTCONF
430 /*
431 * 0. find the boot device if needed first so we can use it later
432 * this needs to be done before we autoconfigure any raid sets,
433 * because if we use wedges we are not going to be able to open
434 * the boot device later
435 */
436 if (booted_device == NULL)
437 cpu_bootconf();
438 #endif
439 /* 1. locate all RAID components on the system */
440 aprint_debug("Searching for RAID components...\n");
441 ac_list = rf_find_raid_components();
442
443 /* 2. Sort them into their respective sets. */
444 config_sets = rf_create_auto_sets(ac_list);
445
446 /*
447 * 3. Evaluate each set and configure the valid ones.
448 * This gets done in rf_buildroothack().
449 */
450 rf_buildroothack(config_sets);
451
452 return 1;
453 }
454
455 static int
456 rf_containsboot(RF_Raid_t *r, device_t bdv) {
457 const char *bootname = device_xname(bdv);
458 size_t len = strlen(bootname);
459
460 for (int col = 0; col < r->numCol; col++) {
461 const char *devname = r->Disks[col].devname;
462 devname += sizeof("/dev/") - 1;
463 if (strncmp(devname, "dk", 2) == 0) {
464 const char *parent =
465 dkwedge_get_parent_name(r->Disks[col].dev);
466 if (parent != NULL)
467 devname = parent;
468 }
469 if (strncmp(devname, bootname, len) == 0) {
470 struct raid_softc *sc = r->softc;
471 aprint_debug("raid%d includes boot device %s\n",
472 sc->sc_unit, devname);
473 return 1;
474 }
475 }
476 return 0;
477 }
478
479 void
480 rf_buildroothack(RF_ConfigSet_t *config_sets)
481 {
482 RF_ConfigSet_t *cset;
483 RF_ConfigSet_t *next_cset;
484 int num_root;
485 struct raid_softc *sc, *rsc;
486 struct dk_softc *dksc;
487
488 sc = rsc = NULL;
489 num_root = 0;
490 cset = config_sets;
491 while (cset != NULL) {
492 next_cset = cset->next;
493 if (rf_have_enough_components(cset) &&
494 cset->ac->clabel->autoconfigure == 1) {
495 sc = rf_auto_config_set(cset);
496 if (sc != NULL) {
497 aprint_debug("raid%d: configured ok\n",
498 sc->sc_unit);
499 if (cset->rootable) {
500 rsc = sc;
501 num_root++;
502 }
503 } else {
504 /* The autoconfig didn't work :( */
505 aprint_debug("Autoconfig failed\n");
506 rf_release_all_vps(cset);
507 }
508 } else {
509 /* we're not autoconfiguring this set...
510 release the associated resources */
511 rf_release_all_vps(cset);
512 }
513 /* cleanup */
514 rf_cleanup_config_set(cset);
515 cset = next_cset;
516 }
517 dksc = &rsc->sc_dksc;
518
519 /* if the user has specified what the root device should be
520 then we don't touch booted_device or boothowto... */
521
522 if (rootspec != NULL)
523 return;
524
525 /* we found something bootable... */
526
527 /*
528 * XXX: The following code assumes that the root raid
529 * is the first ('a') partition. This is about the best
530 * we can do with a BSD disklabel, but we might be able
531 * to do better with a GPT label, by setting a specified
532 * attribute to indicate the root partition. We can then
533 * stash the partition number in the r->root_partition
534 * high bits (the bottom 2 bits are already used). For
535 * now we just set booted_partition to 0 when we override
536 * root.
537 */
538 if (num_root == 1) {
539 device_t candidate_root;
540 if (dksc->sc_dkdev.dk_nwedges != 0) {
541 char cname[sizeof(cset->ac->devname)];
542 /* XXX: assume 'a' */
543 snprintf(cname, sizeof(cname), "%s%c",
544 device_xname(dksc->sc_dev), 'a');
545 candidate_root = dkwedge_find_by_wname(cname);
546 } else
547 candidate_root = dksc->sc_dev;
548 if (booted_device == NULL ||
549 rsc->sc_r.root_partition == 1 ||
550 rf_containsboot(&rsc->sc_r, booted_device)) {
551 booted_device = candidate_root;
552 booted_partition = 0; /* XXX assume 'a' */
553 }
554 } else if (num_root > 1) {
555
556 /*
557 * Maybe the MD code can help. If it cannot, then
558 * setroot() will discover that we have no
559 * booted_device and will ask the user if nothing was
560 * hardwired in the kernel config file
561 */
562 if (booted_device == NULL)
563 return;
564
565 num_root = 0;
566 mutex_enter(&raid_lock);
567 LIST_FOREACH(sc, &raids, sc_link) {
568 RF_Raid_t *r = &sc->sc_r;
569 if (r->valid == 0)
570 continue;
571
572 if (r->root_partition == 0)
573 continue;
574
575 if (rf_containsboot(r, booted_device)) {
576 num_root++;
577 rsc = sc;
578 dksc = &rsc->sc_dksc;
579 }
580 }
581 mutex_exit(&raid_lock);
582
583 if (num_root == 1) {
584 booted_device = dksc->sc_dev;
585 booted_partition = 0; /* XXX assume 'a' */
586 } else {
587 /* we can't guess.. require the user to answer... */
588 boothowto |= RB_ASKNAME;
589 }
590 }
591 }
592
593 static int
594 raidsize(dev_t dev)
595 {
596 struct raid_softc *rs;
597 struct dk_softc *dksc;
598 unsigned int unit;
599
600 unit = raidunit(dev);
601 if ((rs = raidget(unit, false)) == NULL)
602 return -1;
603 dksc = &rs->sc_dksc;
604
605 if ((rs->sc_flags & RAIDF_INITED) == 0)
606 return -1;
607
608 return dk_size(dksc, dev);
609 }
610
611 static int
612 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
613 {
614 unsigned int unit;
615 struct raid_softc *rs;
616 struct dk_softc *dksc;
617
618 unit = raidunit(dev);
619 if ((rs = raidget(unit, false)) == NULL)
620 return ENXIO;
621 dksc = &rs->sc_dksc;
622
623 if ((rs->sc_flags & RAIDF_INITED) == 0)
624 return ENODEV;
625
626 /*
627 Note that blkno is relative to this particular partition.
628 By adding adding RF_PROTECTED_SECTORS, we get a value that
629 is relative to the partition used for the underlying component.
630 */
631 blkno += RF_PROTECTED_SECTORS;
632
633 return dk_dump(dksc, dev, blkno, va, size);
634 }
635
636 static int
637 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
638 {
639 struct raid_softc *rs = raidsoftc(dev);
640 const struct bdevsw *bdev;
641 RF_Raid_t *raidPtr;
642 int c, sparecol, j, scol, dumpto;
643 int error = 0;
644
645 raidPtr = &rs->sc_r;
646
647 /* we only support dumping to RAID 1 sets */
648 if (raidPtr->Layout.numDataCol != 1 ||
649 raidPtr->Layout.numParityCol != 1)
650 return EINVAL;
651
652 if ((error = raidlock(rs)) != 0)
653 return error;
654
655 /* figure out what device is alive.. */
656
657 /*
658 Look for a component to dump to. The preference for the
659 component to dump to is as follows:
660 1) the master
661 2) a used_spare of the master
662 3) the slave
663 4) a used_spare of the slave
664 */
665
666 dumpto = -1;
667 for (c = 0; c < raidPtr->numCol; c++) {
668 if (raidPtr->Disks[c].status == rf_ds_optimal) {
669 /* this might be the one */
670 dumpto = c;
671 break;
672 }
673 }
674
675 /*
676 At this point we have possibly selected a live master or a
677 live slave. We now check to see if there is a spared
678 master (or a spared slave), if we didn't find a live master
679 or a live slave.
680 */
681
682 for (c = 0; c < raidPtr->numSpare; c++) {
683 sparecol = raidPtr->numCol + c;
684 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
685 /* How about this one? */
686 scol = -1;
687 for(j=0;j<raidPtr->numCol;j++) {
688 if (raidPtr->Disks[j].spareCol == sparecol) {
689 scol = j;
690 break;
691 }
692 }
693 if (scol == 0) {
694 /*
695 We must have found a spared master!
696 We'll take that over anything else
697 found so far. (We couldn't have
698 found a real master before, since
699 this is a used spare, and it's
700 saying that it's replacing the
701 master.) On reboot (with
702 autoconfiguration turned on)
703 sparecol will become the 1st
704 component (component0) of this set.
705 */
706 dumpto = sparecol;
707 break;
708 } else if (scol != -1) {
709 /*
710 Must be a spared slave. We'll dump
711 to that if we havn't found anything
712 else so far.
713 */
714 if (dumpto == -1)
715 dumpto = sparecol;
716 }
717 }
718 }
719
720 if (dumpto == -1) {
721 /* we couldn't find any live components to dump to!?!?
722 */
723 error = EINVAL;
724 goto out;
725 }
726
727 bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
728
729 error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
730 blkno, va, nblk * raidPtr->bytesPerSector);
731
732 out:
733 raidunlock(rs);
734
735 return error;
736 }
737
738 /* ARGSUSED */
739 static int
740 raidopen(dev_t dev, int flags, int fmt,
741 struct lwp *l)
742 {
743 int unit = raidunit(dev);
744 struct raid_softc *rs;
745 struct dk_softc *dksc;
746 int error = 0;
747 int part, pmask;
748
749 if ((rs = raidget(unit, true)) == NULL)
750 return ENXIO;
751 if ((error = raidlock(rs)) != 0)
752 return (error);
753
754 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
755 error = EBUSY;
756 goto bad;
757 }
758
759 dksc = &rs->sc_dksc;
760
761 part = DISKPART(dev);
762 pmask = (1 << part);
763
764 if (!DK_BUSY(dksc, pmask) &&
765 ((rs->sc_flags & RAIDF_INITED) != 0)) {
766 /* First one... mark things as dirty... Note that we *MUST*
767 have done a configure before this. I DO NOT WANT TO BE
768 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
769 THAT THEY BELONG TOGETHER!!!!! */
770 /* XXX should check to see if we're only open for reading
771 here... If so, we needn't do this, but then need some
772 other way of keeping track of what's happened.. */
773
774 rf_markalldirty(&rs->sc_r);
775 }
776
777 if ((rs->sc_flags & RAIDF_INITED) != 0)
778 error = dk_open(dksc, dev, flags, fmt, l);
779
780 bad:
781 raidunlock(rs);
782
783 return (error);
784
785
786 }
787
788 static int
789 raid_lastclose(device_t self)
790 {
791 struct raid_softc *rs = raidsoftc(self);
792
793 /* Last one... device is not unconfigured yet.
794 Device shutdown has taken care of setting the
795 clean bits if RAIDF_INITED is not set
796 mark things as clean... */
797
798 rf_update_component_labels(&rs->sc_r,
799 RF_FINAL_COMPONENT_UPDATE);
800
801 /* pass to unlocked code */
802 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
803 rs->sc_flags |= RAIDF_DETACH;
804
805 return 0;
806 }
807
/*
 * Close the raid device.  On the final close of a unit flagged for
 * shutdown, either detach the pseudo-device (RAIDF_DETACH, set by
 * raid_lastclose()) or drop the never-configured softc (RAIDF_SHUTDOWN
 * without RAIDF_INITED).  Both actions happen after the unit lock is
 * released, since config_detach()/raidput() must run unlocked.
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* RAIDF_DETACH is set by raid_lastclose() on final close */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
847
848 static void
849 raid_wakeup(RF_Raid_t *raidPtr)
850 {
851 rf_lock_mutex2(raidPtr->iodone_lock);
852 rf_signal_cond2(raidPtr->iodone_cv);
853 rf_unlock_mutex2(raidPtr->iodone_lock);
854 }
855
856 static void
857 raidstrategy(struct buf *bp)
858 {
859 unsigned int unit;
860 struct raid_softc *rs;
861 struct dk_softc *dksc;
862 RF_Raid_t *raidPtr;
863
864 unit = raidunit(bp->b_dev);
865 if ((rs = raidget(unit, false)) == NULL) {
866 bp->b_error = ENXIO;
867 goto fail;
868 }
869 if ((rs->sc_flags & RAIDF_INITED) == 0) {
870 bp->b_error = ENXIO;
871 goto fail;
872 }
873 dksc = &rs->sc_dksc;
874 raidPtr = &rs->sc_r;
875
876 /* Queue IO only */
877 if (dk_strategy_defer(dksc, bp))
878 goto done;
879
880 /* schedule the IO to happen at the next convenient time */
881 raid_wakeup(raidPtr);
882
883 done:
884 return;
885
886 fail:
887 bp->b_resid = bp->b_bcount;
888 biodone(bp);
889 }
890
891 static int
892 raid_diskstart(device_t dev, struct buf *bp)
893 {
894 struct raid_softc *rs = raidsoftc(dev);
895 RF_Raid_t *raidPtr;
896
897 raidPtr = &rs->sc_r;
898 if (!raidPtr->valid) {
899 db1_printf(("raid is not valid..\n"));
900 return ENODEV;
901 }
902
903 /* XXX */
904 bp->b_resid = 0;
905
906 return raiddoaccess(raidPtr, bp);
907 }
908
909 void
910 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
911 {
912 struct raid_softc *rs;
913 struct dk_softc *dksc;
914
915 rs = raidPtr->softc;
916 dksc = &rs->sc_dksc;
917
918 dk_done(dksc, bp);
919
920 rf_lock_mutex2(raidPtr->mutex);
921 raidPtr->openings++;
922 rf_unlock_mutex2(raidPtr->mutex);
923
924 /* schedule more IO */
925 raid_wakeup(raidPtr);
926 }
927
928 /* ARGSUSED */
929 static int
930 raidread(dev_t dev, struct uio *uio, int flags)
931 {
932 int unit = raidunit(dev);
933 struct raid_softc *rs;
934
935 if ((rs = raidget(unit, false)) == NULL)
936 return ENXIO;
937
938 if ((rs->sc_flags & RAIDF_INITED) == 0)
939 return (ENXIO);
940
941 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
942
943 }
944
945 /* ARGSUSED */
946 static int
947 raidwrite(dev_t dev, struct uio *uio, int flags)
948 {
949 int unit = raidunit(dev);
950 struct raid_softc *rs;
951
952 if ((rs = raidget(unit, false)) == NULL)
953 return ENXIO;
954
955 if ((rs->sc_flags & RAIDF_INITED) == 0)
956 return (ENXIO);
957
958 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
959
960 }
961
/*
 * Unconfigure a raid set and detach its disk, with the unit lock held
 * by the caller.  Refuses (EBUSY) while any partition is open or a
 * reconstruction, parity rewrite, or copyback is in progress.  Returns
 * 0 immediately if the unit was never configured.  The teardown order
 * matters: shut down RAIDframe first, then drain and free the buffer
 * queue, then detach/destroy the dk(9) disk state.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* The unit survives this detach; it is no longer shutting down. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
999
1000 static int
1001 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1002 {
1003 int unit = raidunit(dev);
1004 int error = 0;
1005 int part, pmask;
1006 struct raid_softc *rs;
1007 struct dk_softc *dksc;
1008 RF_Config_t *k_cfg, *u_cfg;
1009 RF_Raid_t *raidPtr;
1010 RF_RaidDisk_t *diskPtr;
1011 RF_AccTotals_t *totals;
1012 RF_DeviceConfig_t *d_cfg, **ucfgp;
1013 u_char *specific_buf;
1014 int retcode = 0;
1015 int column;
1016 /* int raidid; */
1017 struct rf_recon_req *rrcopy, *rr;
1018 RF_ComponentLabel_t *clabel;
1019 RF_ComponentLabel_t *ci_label;
1020 RF_ComponentLabel_t **clabel_ptr;
1021 RF_SingleComponent_t *sparePtr,*componentPtr;
1022 RF_SingleComponent_t component;
1023 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1024 int i, j, d;
1025
1026 if ((rs = raidget(unit, false)) == NULL)
1027 return ENXIO;
1028 dksc = &rs->sc_dksc;
1029 raidPtr = &rs->sc_r;
1030
1031 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1032 (int) DISKPART(dev), (int) unit, cmd));
1033
1034 /* Must be initialized for these... */
1035 switch (cmd) {
1036 case RAIDFRAME_REWRITEPARITY:
1037 case RAIDFRAME_GET_INFO:
1038 case RAIDFRAME_RESET_ACCTOTALS:
1039 case RAIDFRAME_GET_ACCTOTALS:
1040 case RAIDFRAME_KEEP_ACCTOTALS:
1041 case RAIDFRAME_GET_SIZE:
1042 case RAIDFRAME_FAIL_DISK:
1043 case RAIDFRAME_COPYBACK:
1044 case RAIDFRAME_CHECK_RECON_STATUS:
1045 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1046 case RAIDFRAME_GET_COMPONENT_LABEL:
1047 case RAIDFRAME_SET_COMPONENT_LABEL:
1048 case RAIDFRAME_ADD_HOT_SPARE:
1049 case RAIDFRAME_REMOVE_HOT_SPARE:
1050 case RAIDFRAME_INIT_LABELS:
1051 case RAIDFRAME_REBUILD_IN_PLACE:
1052 case RAIDFRAME_CHECK_PARITY:
1053 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1054 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1055 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1056 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1057 case RAIDFRAME_SET_AUTOCONFIG:
1058 case RAIDFRAME_SET_ROOT:
1059 case RAIDFRAME_DELETE_COMPONENT:
1060 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1061 case RAIDFRAME_PARITYMAP_STATUS:
1062 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1063 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1064 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1065 if ((rs->sc_flags & RAIDF_INITED) == 0)
1066 return (ENXIO);
1067 }
1068
1069 switch (cmd) {
1070 #ifdef COMPAT_50
1071 case RAIDFRAME_GET_INFO50:
1072 return rf_get_info50(raidPtr, data);
1073
1074 case RAIDFRAME_CONFIGURE50:
1075 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1076 return retcode;
1077 goto config;
1078 #endif
1079 /* configure the system */
1080 case RAIDFRAME_CONFIGURE:
1081
1082 if (raidPtr->valid) {
1083 /* There is a valid RAID set running on this unit! */
1084 printf("raid%d: Device already configured!\n",unit);
1085 return(EINVAL);
1086 }
1087
1088 /* copy-in the configuration information */
1089 /* data points to a pointer to the configuration structure */
1090
1091 u_cfg = *((RF_Config_t **) data);
1092 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1093 if (k_cfg == NULL) {
1094 return (ENOMEM);
1095 }
1096 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1097 if (retcode) {
1098 RF_Free(k_cfg, sizeof(RF_Config_t));
1099 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1100 retcode));
1101 goto no_config;
1102 }
1103 goto config;
1104 config:
1105 rs->sc_flags &= ~RAIDF_SHUTDOWN;
1106
1107 /* allocate a buffer for the layout-specific data, and copy it
1108 * in */
1109 if (k_cfg->layoutSpecificSize) {
1110 if (k_cfg->layoutSpecificSize > 10000) {
1111 /* sanity check */
1112 RF_Free(k_cfg, sizeof(RF_Config_t));
1113 retcode = EINVAL;
1114 goto no_config;
1115 }
1116 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1117 (u_char *));
1118 if (specific_buf == NULL) {
1119 RF_Free(k_cfg, sizeof(RF_Config_t));
1120 retcode = ENOMEM;
1121 goto no_config;
1122 }
1123 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1124 k_cfg->layoutSpecificSize);
1125 if (retcode) {
1126 RF_Free(k_cfg, sizeof(RF_Config_t));
1127 RF_Free(specific_buf,
1128 k_cfg->layoutSpecificSize);
1129 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1130 retcode));
1131 goto no_config;
1132 }
1133 } else
1134 specific_buf = NULL;
1135 k_cfg->layoutSpecific = specific_buf;
1136
1137 /* should do some kind of sanity check on the configuration.
1138 * Store the sum of all the bytes in the last byte? */
1139
1140 /* configure the system */
1141
1142 /*
1143 * Clear the entire RAID descriptor, just to make sure
1144 * there is no stale data left in the case of a
1145 * reconfiguration
1146 */
1147 memset(raidPtr, 0, sizeof(*raidPtr));
1148 raidPtr->softc = rs;
1149 raidPtr->raidid = unit;
1150
1151 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1152
1153 if (retcode == 0) {
1154
1155 /* allow this many simultaneous IO's to
1156 this RAID device */
1157 raidPtr->openings = RAIDOUTSTANDING;
1158
1159 raidinit(rs);
1160 raid_wakeup(raidPtr);
1161 rf_markalldirty(raidPtr);
1162 }
1163 /* free the buffers. No return code here. */
1164 if (k_cfg->layoutSpecificSize) {
1165 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1166 }
1167 RF_Free(k_cfg, sizeof(RF_Config_t));
1168
1169 no_config:
1170 /*
1171 * If configuration failed, set sc_flags so that we
1172 * will detach the device when we close it.
1173 */
1174 if (retcode != 0)
1175 rs->sc_flags |= RAIDF_SHUTDOWN;
1176 return (retcode);
1177
1178 /* shutdown the system */
1179 case RAIDFRAME_SHUTDOWN:
1180
1181 part = DISKPART(dev);
1182 pmask = (1 << part);
1183
1184 if ((error = raidlock(rs)) != 0)
1185 return (error);
1186
1187 if (DK_BUSY(dksc, pmask) ||
1188 raidPtr->recon_in_progress != 0 ||
1189 raidPtr->parity_rewrite_in_progress != 0 ||
1190 raidPtr->copyback_in_progress != 0)
1191 retcode = EBUSY;
1192 else {
1193 /* detach and free on close */
1194 rs->sc_flags |= RAIDF_SHUTDOWN;
1195 retcode = 0;
1196 }
1197
1198 raidunlock(rs);
1199
1200 return (retcode);
1201 case RAIDFRAME_GET_COMPONENT_LABEL:
1202 clabel_ptr = (RF_ComponentLabel_t **) data;
1203 /* need to read the component label for the disk indicated
1204 by row,column in clabel */
1205
1206 /*
1207 * Perhaps there should be an option to skip the in-core
1208 * copy and hit the disk, as with disklabel(8).
1209 */
1210 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1211
1212 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1213
1214 if (retcode) {
1215 RF_Free(clabel, sizeof(*clabel));
1216 return retcode;
1217 }
1218
1219 clabel->row = 0; /* Don't allow looking at anything else.*/
1220
1221 column = clabel->column;
1222
1223 if ((column < 0) || (column >= raidPtr->numCol +
1224 raidPtr->numSpare)) {
1225 RF_Free(clabel, sizeof(*clabel));
1226 return EINVAL;
1227 }
1228
1229 RF_Free(clabel, sizeof(*clabel));
1230
1231 clabel = raidget_component_label(raidPtr, column);
1232
1233 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1234
1235 #if 0
1236 case RAIDFRAME_SET_COMPONENT_LABEL:
1237 clabel = (RF_ComponentLabel_t *) data;
1238
1239 /* XXX check the label for valid stuff... */
1240 /* Note that some things *should not* get modified --
1241 the user should be re-initing the labels instead of
1242 trying to patch things.
1243 */
1244
1245 raidid = raidPtr->raidid;
1246 #ifdef DEBUG
1247 printf("raid%d: Got component label:\n", raidid);
1248 printf("raid%d: Version: %d\n", raidid, clabel->version);
1249 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1250 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1251 printf("raid%d: Column: %d\n", raidid, clabel->column);
1252 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1253 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1254 printf("raid%d: Status: %d\n", raidid, clabel->status);
1255 #endif
1256 clabel->row = 0;
1257 column = clabel->column;
1258
1259 if ((column < 0) || (column >= raidPtr->numCol)) {
1260 return(EINVAL);
1261 }
1262
1263 /* XXX this isn't allowed to do anything for now :-) */
1264
1265 /* XXX and before it is, we need to fill in the rest
1266 of the fields!?!?!?! */
1267 memcpy(raidget_component_label(raidPtr, column),
1268 clabel, sizeof(*clabel));
1269 raidflush_component_label(raidPtr, column);
1270 return (0);
1271 #endif
1272
1273 case RAIDFRAME_INIT_LABELS:
1274 clabel = (RF_ComponentLabel_t *) data;
1275 /*
1276 we only want the serial number from
1277 the above. We get all the rest of the information
1278 from the config that was used to create this RAID
1279 set.
1280 */
1281
1282 raidPtr->serial_number = clabel->serial_number;
1283
1284 for(column=0;column<raidPtr->numCol;column++) {
1285 diskPtr = &raidPtr->Disks[column];
1286 if (!RF_DEAD_DISK(diskPtr->status)) {
1287 ci_label = raidget_component_label(raidPtr,
1288 column);
1289 /* Zeroing this is important. */
1290 memset(ci_label, 0, sizeof(*ci_label));
1291 raid_init_component_label(raidPtr, ci_label);
1292 ci_label->serial_number =
1293 raidPtr->serial_number;
1294 ci_label->row = 0; /* we dont' pretend to support more */
1295 rf_component_label_set_partitionsize(ci_label,
1296 diskPtr->partitionSize);
1297 ci_label->column = column;
1298 raidflush_component_label(raidPtr, column);
1299 }
1300 /* XXXjld what about the spares? */
1301 }
1302
1303 return (retcode);
1304 case RAIDFRAME_SET_AUTOCONFIG:
1305 d = rf_set_autoconfig(raidPtr, *(int *) data);
1306 printf("raid%d: New autoconfig value is: %d\n",
1307 raidPtr->raidid, d);
1308 *(int *) data = d;
1309 return (retcode);
1310
1311 case RAIDFRAME_SET_ROOT:
1312 d = rf_set_rootpartition(raidPtr, *(int *) data);
1313 printf("raid%d: New rootpartition value is: %d\n",
1314 raidPtr->raidid, d);
1315 *(int *) data = d;
1316 return (retcode);
1317
1318 /* initialize all parity */
1319 case RAIDFRAME_REWRITEPARITY:
1320
1321 if (raidPtr->Layout.map->faultsTolerated == 0) {
1322 /* Parity for RAID 0 is trivially correct */
1323 raidPtr->parity_good = RF_RAID_CLEAN;
1324 return(0);
1325 }
1326
1327 if (raidPtr->parity_rewrite_in_progress == 1) {
1328 /* Re-write is already in progress! */
1329 return(EINVAL);
1330 }
1331
1332 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1333 rf_RewriteParityThread,
1334 raidPtr,"raid_parity");
1335 return (retcode);
1336
1337
1338 case RAIDFRAME_ADD_HOT_SPARE:
1339 sparePtr = (RF_SingleComponent_t *) data;
1340 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1341 retcode = rf_add_hot_spare(raidPtr, &component);
1342 return(retcode);
1343
1344 case RAIDFRAME_REMOVE_HOT_SPARE:
1345 return(retcode);
1346
1347 case RAIDFRAME_DELETE_COMPONENT:
1348 componentPtr = (RF_SingleComponent_t *)data;
1349 memcpy( &component, componentPtr,
1350 sizeof(RF_SingleComponent_t));
1351 retcode = rf_delete_component(raidPtr, &component);
1352 return(retcode);
1353
1354 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1355 componentPtr = (RF_SingleComponent_t *)data;
1356 memcpy( &component, componentPtr,
1357 sizeof(RF_SingleComponent_t));
1358 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1359 return(retcode);
1360
1361 case RAIDFRAME_REBUILD_IN_PLACE:
1362
1363 if (raidPtr->Layout.map->faultsTolerated == 0) {
1364 /* Can't do this on a RAID 0!! */
1365 return(EINVAL);
1366 }
1367
1368 if (raidPtr->recon_in_progress == 1) {
1369 /* a reconstruct is already in progress! */
1370 return(EINVAL);
1371 }
1372
1373 componentPtr = (RF_SingleComponent_t *) data;
1374 memcpy( &component, componentPtr,
1375 sizeof(RF_SingleComponent_t));
1376 component.row = 0; /* we don't support any more */
1377 column = component.column;
1378
1379 if ((column < 0) || (column >= raidPtr->numCol)) {
1380 return(EINVAL);
1381 }
1382
1383 rf_lock_mutex2(raidPtr->mutex);
1384 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1385 (raidPtr->numFailures > 0)) {
1386 /* XXX 0 above shouldn't be constant!!! */
1387 /* some component other than this has failed.
1388 Let's not make things worse than they already
1389 are... */
1390 printf("raid%d: Unable to reconstruct to disk at:\n",
1391 raidPtr->raidid);
1392 printf("raid%d: Col: %d Too many failures.\n",
1393 raidPtr->raidid, column);
1394 rf_unlock_mutex2(raidPtr->mutex);
1395 return (EINVAL);
1396 }
1397 if (raidPtr->Disks[column].status ==
1398 rf_ds_reconstructing) {
1399 printf("raid%d: Unable to reconstruct to disk at:\n",
1400 raidPtr->raidid);
1401 printf("raid%d: Col: %d Reconstruction already occurring!\n", raidPtr->raidid, column);
1402
1403 rf_unlock_mutex2(raidPtr->mutex);
1404 return (EINVAL);
1405 }
1406 if (raidPtr->Disks[column].status == rf_ds_spared) {
1407 rf_unlock_mutex2(raidPtr->mutex);
1408 return (EINVAL);
1409 }
1410 rf_unlock_mutex2(raidPtr->mutex);
1411
1412 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1413 if (rrcopy == NULL)
1414 return(ENOMEM);
1415
1416 rrcopy->raidPtr = (void *) raidPtr;
1417 rrcopy->col = column;
1418
1419 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1420 rf_ReconstructInPlaceThread,
1421 rrcopy,"raid_reconip");
1422 return(retcode);
1423
1424 case RAIDFRAME_GET_INFO:
1425 if (!raidPtr->valid)
1426 return (ENODEV);
1427 ucfgp = (RF_DeviceConfig_t **) data;
1428 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1429 (RF_DeviceConfig_t *));
1430 if (d_cfg == NULL)
1431 return (ENOMEM);
1432 d_cfg->rows = 1; /* there is only 1 row now */
1433 d_cfg->cols = raidPtr->numCol;
1434 d_cfg->ndevs = raidPtr->numCol;
1435 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1436 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1437 return (ENOMEM);
1438 }
1439 d_cfg->nspares = raidPtr->numSpare;
1440 if (d_cfg->nspares >= RF_MAX_DISKS) {
1441 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1442 return (ENOMEM);
1443 }
1444 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1445 d = 0;
1446 for (j = 0; j < d_cfg->cols; j++) {
1447 d_cfg->devs[d] = raidPtr->Disks[j];
1448 d++;
1449 }
1450 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1451 d_cfg->spares[i] = raidPtr->Disks[j];
1452 if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
1453 /* XXX: raidctl(8) expects to see this as a used spare */
1454 d_cfg->spares[i].status = rf_ds_used_spare;
1455 }
1456 }
1457 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1458 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1459
1460 return (retcode);
1461
1462 case RAIDFRAME_CHECK_PARITY:
1463 *(int *) data = raidPtr->parity_good;
1464 return (0);
1465
1466 case RAIDFRAME_PARITYMAP_STATUS:
1467 if (rf_paritymap_ineligible(raidPtr))
1468 return EINVAL;
1469 rf_paritymap_status(raidPtr->parity_map,
1470 (struct rf_pmstat *)data);
1471 return 0;
1472
1473 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1474 if (rf_paritymap_ineligible(raidPtr))
1475 return EINVAL;
1476 if (raidPtr->parity_map == NULL)
1477 return ENOENT; /* ??? */
1478 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1479 (struct rf_pmparams *)data, 1))
1480 return EINVAL;
1481 return 0;
1482
1483 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1484 if (rf_paritymap_ineligible(raidPtr))
1485 return EINVAL;
1486 *(int *) data = rf_paritymap_get_disable(raidPtr);
1487 return 0;
1488
1489 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1490 if (rf_paritymap_ineligible(raidPtr))
1491 return EINVAL;
1492 rf_paritymap_set_disable(raidPtr, *(int *)data);
1493 /* XXX should errors be passed up? */
1494 return 0;
1495
1496 case RAIDFRAME_RESET_ACCTOTALS:
1497 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1498 return (0);
1499
1500 case RAIDFRAME_GET_ACCTOTALS:
1501 totals = (RF_AccTotals_t *) data;
1502 *totals = raidPtr->acc_totals;
1503 return (0);
1504
1505 case RAIDFRAME_KEEP_ACCTOTALS:
1506 raidPtr->keep_acc_totals = *(int *)data;
1507 return (0);
1508
1509 case RAIDFRAME_GET_SIZE:
1510 *(int *) data = raidPtr->totalSectors;
1511 return (0);
1512
1513 /* fail a disk & optionally start reconstruction */
1514 case RAIDFRAME_FAIL_DISK:
1515
1516 if (raidPtr->Layout.map->faultsTolerated == 0) {
1517 /* Can't do this on a RAID 0!! */
1518 return(EINVAL);
1519 }
1520
1521 rr = (struct rf_recon_req *) data;
1522 rr->row = 0;
1523 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1524 return (EINVAL);
1525
1526
1527 rf_lock_mutex2(raidPtr->mutex);
1528 if (raidPtr->status == rf_rs_reconstructing) {
1529 /* you can't fail a disk while we're reconstructing! */
1530 /* XXX wrong for RAID6 */
1531 rf_unlock_mutex2(raidPtr->mutex);
1532 return (EINVAL);
1533 }
1534 if ((raidPtr->Disks[rr->col].status ==
1535 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1536 /* some other component has failed. Let's not make
1537 things worse. XXX wrong for RAID6 */
1538 rf_unlock_mutex2(raidPtr->mutex);
1539 return (EINVAL);
1540 }
1541 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1542 /* Can't fail a spared disk! */
1543 rf_unlock_mutex2(raidPtr->mutex);
1544 return (EINVAL);
1545 }
1546 rf_unlock_mutex2(raidPtr->mutex);
1547
1548 /* make a copy of the recon request so that we don't rely on
1549 * the user's buffer */
1550 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1551 if (rrcopy == NULL)
1552 return(ENOMEM);
1553 memcpy(rrcopy, rr, sizeof(*rr));
1554 rrcopy->raidPtr = (void *) raidPtr;
1555
1556 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1557 rf_ReconThread,
1558 rrcopy,"raid_recon");
1559 return (0);
1560
1561 /* invoke a copyback operation after recon on whatever disk
1562 * needs it, if any */
1563 case RAIDFRAME_COPYBACK:
1564
1565 if (raidPtr->Layout.map->faultsTolerated == 0) {
1566 /* This makes no sense on a RAID 0!! */
1567 return(EINVAL);
1568 }
1569
1570 if (raidPtr->copyback_in_progress == 1) {
1571 /* Copyback is already in progress! */
1572 return(EINVAL);
1573 }
1574
1575 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1576 rf_CopybackThread,
1577 raidPtr,"raid_copyback");
1578 return (retcode);
1579
1580 /* return the percentage completion of reconstruction */
1581 case RAIDFRAME_CHECK_RECON_STATUS:
1582 if (raidPtr->Layout.map->faultsTolerated == 0) {
1583 /* This makes no sense on a RAID 0, so tell the
1584 user it's done. */
1585 *(int *) data = 100;
1586 return(0);
1587 }
1588 if (raidPtr->status != rf_rs_reconstructing)
1589 *(int *) data = 100;
1590 else {
1591 if (raidPtr->reconControl->numRUsTotal > 0) {
1592 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1593 } else {
1594 *(int *) data = 0;
1595 }
1596 }
1597 return (0);
1598 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1599 progressInfoPtr = (RF_ProgressInfo_t **) data;
1600 if (raidPtr->status != rf_rs_reconstructing) {
1601 progressInfo.remaining = 0;
1602 progressInfo.completed = 100;
1603 progressInfo.total = 100;
1604 } else {
1605 progressInfo.total =
1606 raidPtr->reconControl->numRUsTotal;
1607 progressInfo.completed =
1608 raidPtr->reconControl->numRUsComplete;
1609 progressInfo.remaining = progressInfo.total -
1610 progressInfo.completed;
1611 }
1612 retcode = copyout(&progressInfo, *progressInfoPtr,
1613 sizeof(RF_ProgressInfo_t));
1614 return (retcode);
1615
1616 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1617 if (raidPtr->Layout.map->faultsTolerated == 0) {
1618 /* This makes no sense on a RAID 0, so tell the
1619 user it's done. */
1620 *(int *) data = 100;
1621 return(0);
1622 }
1623 if (raidPtr->parity_rewrite_in_progress == 1) {
1624 *(int *) data = 100 *
1625 raidPtr->parity_rewrite_stripes_done /
1626 raidPtr->Layout.numStripe;
1627 } else {
1628 *(int *) data = 100;
1629 }
1630 return (0);
1631
1632 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1633 progressInfoPtr = (RF_ProgressInfo_t **) data;
1634 if (raidPtr->parity_rewrite_in_progress == 1) {
1635 progressInfo.total = raidPtr->Layout.numStripe;
1636 progressInfo.completed =
1637 raidPtr->parity_rewrite_stripes_done;
1638 progressInfo.remaining = progressInfo.total -
1639 progressInfo.completed;
1640 } else {
1641 progressInfo.remaining = 0;
1642 progressInfo.completed = 100;
1643 progressInfo.total = 100;
1644 }
1645 retcode = copyout(&progressInfo, *progressInfoPtr,
1646 sizeof(RF_ProgressInfo_t));
1647 return (retcode);
1648
1649 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1650 if (raidPtr->Layout.map->faultsTolerated == 0) {
1651 /* This makes no sense on a RAID 0 */
1652 *(int *) data = 100;
1653 return(0);
1654 }
1655 if (raidPtr->copyback_in_progress == 1) {
1656 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1657 raidPtr->Layout.numStripe;
1658 } else {
1659 *(int *) data = 100;
1660 }
1661 return (0);
1662
1663 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1664 progressInfoPtr = (RF_ProgressInfo_t **) data;
1665 if (raidPtr->copyback_in_progress == 1) {
1666 progressInfo.total = raidPtr->Layout.numStripe;
1667 progressInfo.completed =
1668 raidPtr->copyback_stripes_done;
1669 progressInfo.remaining = progressInfo.total -
1670 progressInfo.completed;
1671 } else {
1672 progressInfo.remaining = 0;
1673 progressInfo.completed = 100;
1674 progressInfo.total = 100;
1675 }
1676 retcode = copyout(&progressInfo, *progressInfoPtr,
1677 sizeof(RF_ProgressInfo_t));
1678 return (retcode);
1679
1680 case RAIDFRAME_SET_LAST_UNIT:
1681 for (column = 0; column < raidPtr->numCol; column++)
1682 if (raidPtr->Disks[column].status != rf_ds_optimal)
1683 return EBUSY;
1684
1685 for (column = 0; column < raidPtr->numCol; column++) {
1686 clabel = raidget_component_label(raidPtr, column);
1687 clabel->last_unit = *(int *)data;
1688 raidflush_component_label(raidPtr, column);
1689 }
1690 rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1691 return 0;
1692
1693 /* the sparetable daemon calls this to wait for the kernel to
1694 * need a spare table. this ioctl does not return until a
1695 * spare table is needed. XXX -- calling mpsleep here in the
1696 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1697 * -- I should either compute the spare table in the kernel,
1698 * or have a different -- XXX XXX -- interface (a different
1699 * character device) for delivering the table -- XXX */
1700 #if 0
1701 case RAIDFRAME_SPARET_WAIT:
1702 rf_lock_mutex2(rf_sparet_wait_mutex);
1703 while (!rf_sparet_wait_queue)
1704 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1705 waitreq = rf_sparet_wait_queue;
1706 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1707 rf_unlock_mutex2(rf_sparet_wait_mutex);
1708
1709 /* structure assignment */
1710 *((RF_SparetWait_t *) data) = *waitreq;
1711
1712 RF_Free(waitreq, sizeof(*waitreq));
1713 return (0);
1714
1715 /* wakes up a process waiting on SPARET_WAIT and puts an error
	 * code in it that will cause the daemon to exit */
1717 case RAIDFRAME_ABORT_SPARET_WAIT:
1718 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1719 waitreq->fcol = -1;
1720 rf_lock_mutex2(rf_sparet_wait_mutex);
1721 waitreq->next = rf_sparet_wait_queue;
1722 rf_sparet_wait_queue = waitreq;
1723 rf_broadcast_conf2(rf_sparet_wait_cv);
1724 rf_unlock_mutex2(rf_sparet_wait_mutex);
1725 return (0);
1726
1727 /* used by the spare table daemon to deliver a spare table
1728 * into the kernel */
1729 case RAIDFRAME_SEND_SPARET:
1730
1731 /* install the spare table */
1732 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1733
1734 /* respond to the requestor. the return status of the spare
1735 * table installation is passed in the "fcol" field */
1736 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1737 waitreq->fcol = retcode;
1738 rf_lock_mutex2(rf_sparet_wait_mutex);
1739 waitreq->next = rf_sparet_resp_queue;
1740 rf_sparet_resp_queue = waitreq;
1741 rf_broadcast_cond2(rf_sparet_resp_cv);
1742 rf_unlock_mutex2(rf_sparet_wait_mutex);
1743
1744 return (retcode);
1745 #endif
1746
1747 default:
1748 break; /* fall through to the os-specific code below */
1749
1750 }
1751
1752 if (!raidPtr->valid)
1753 return (EINVAL);
1754
1755 /*
1756 * Add support for "regular" device ioctls here.
1757 */
1758
1759 error = dk_ioctl(dksc, dev, cmd, data, flag, l);
1760 if (error != EPASSTHROUGH)
1761 return (error);
1762
1763 switch (cmd) {
1764 case DIOCCACHESYNC:
1765 return rf_sync_component_caches(raidPtr);
1766
1767 default:
1768 retcode = ENOTTY;
1769 }
1770 return (retcode);
1771
1772 }
1773
1774
1775 /* raidinit -- complete the rest of the initialization for the
1776 RAIDframe device. */
1777
1778
1779 static void
1780 raidinit(struct raid_softc *rs)
1781 {
1782 cfdata_t cf;
1783 unsigned int unit;
1784 struct dk_softc *dksc = &rs->sc_dksc;
1785 RF_Raid_t *raidPtr = &rs->sc_r;
1786 device_t dev;
1787
1788 unit = raidPtr->raidid;
1789
1790 /* XXX doesn't check bounds. */
1791 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
1792
1793 /* attach the pseudo device */
1794 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1795 cf->cf_name = raid_cd.cd_name;
1796 cf->cf_atname = raid_cd.cd_name;
1797 cf->cf_unit = unit;
1798 cf->cf_fstate = FSTATE_STAR;
1799
1800 dev = config_attach_pseudo(cf);
1801 if (dev == NULL) {
1802 printf("raid%d: config_attach_pseudo failed\n",
1803 raidPtr->raidid);
1804 free(cf, M_RAIDFRAME);
1805 return;
1806 }
1807
1808 /* provide a backpointer to the real softc */
1809 raidsoftc(dev) = rs;
1810
1811 /* disk_attach actually creates space for the CPU disklabel, among
1812 * other things, so it's critical to call this *BEFORE* we try putzing
1813 * with disklabels. */
1814 dk_init(dksc, dev, DKTYPE_RAID);
1815 disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1816
1817 /* XXX There may be a weird interaction here between this, and
1818 * protectedSectors, as used in RAIDframe. */
1819
1820 rs->sc_size = raidPtr->totalSectors;
1821
1822 /* Attach dk and disk subsystems */
1823 dk_attach(dksc);
1824 disk_attach(&dksc->sc_dkdev);
1825 rf_set_geometry(rs, raidPtr);
1826
1827 bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
1828
1829 /* mark unit as usuable */
1830 rs->sc_flags |= RAIDF_INITED;
1831
1832 dkwedge_discover(&dksc->sc_dkdev);
1833 }
1834
1835 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1836 /* wake up the daemon & tell it to get us a spare table
1837 * XXX
1838 * the entries in the queues should be tagged with the raidPtr
1839 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
1841 * XXX
1842 *
1843 * XXX This code is not currently used. GO
1844 */
1845 int
1846 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1847 {
1848 int retcode;
1849
1850 rf_lock_mutex2(rf_sparet_wait_mutex);
1851 req->next = rf_sparet_wait_queue;
1852 rf_sparet_wait_queue = req;
1853 rf_broadcast_cond2(rf_sparet_wait_cv);
1854
1855 /* mpsleep unlocks the mutex */
1856 while (!rf_sparet_resp_queue) {
1857 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
1858 }
1859 req = rf_sparet_resp_queue;
1860 rf_sparet_resp_queue = req->next;
1861 rf_unlock_mutex2(rf_sparet_wait_mutex);
1862
1863 retcode = req->fcol;
1864 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1865 * alloc'd */
1866 return (retcode);
1867 }
1868 #endif
1869
1870 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1871 * bp & passes it down.
1872 * any calls originating in the kernel must use non-blocking I/O
1873 * do some extra sanity checking to return "appropriate" error values for
1874 * certain conditions (to make some standard utilities work)
1875 *
1876 * Formerly known as: rf_DoAccessKernel
1877 */
1878 void
1879 raidstart(RF_Raid_t *raidPtr)
1880 {
1881 struct raid_softc *rs;
1882 struct dk_softc *dksc;
1883
1884 rs = raidPtr->softc;
1885 dksc = &rs->sc_dksc;
1886 /* quick check to see if anything has died recently */
1887 rf_lock_mutex2(raidPtr->mutex);
1888 if (raidPtr->numNewFailures > 0) {
1889 rf_unlock_mutex2(raidPtr->mutex);
1890 rf_update_component_labels(raidPtr,
1891 RF_NORMAL_COMPONENT_UPDATE);
1892 rf_lock_mutex2(raidPtr->mutex);
1893 raidPtr->numNewFailures--;
1894 }
1895 rf_unlock_mutex2(raidPtr->mutex);
1896
1897 if ((rs->sc_flags & RAIDF_INITED) == 0) {
1898 printf("raid%d: raidstart not ready\n", raidPtr->raidid);
1899 return;
1900 }
1901
1902 dk_start(dksc, NULL);
1903 }
1904
1905 static int
1906 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
1907 {
1908 RF_SectorCount_t num_blocks, pb, sum;
1909 RF_RaidAddr_t raid_addr;
1910 daddr_t blocknum;
1911 int do_async;
1912 int rc;
1913
1914 rf_lock_mutex2(raidPtr->mutex);
1915 if (raidPtr->openings == 0) {
1916 rf_unlock_mutex2(raidPtr->mutex);
1917 return EAGAIN;
1918 }
1919 rf_unlock_mutex2(raidPtr->mutex);
1920
1921 blocknum = bp->b_rawblkno;
1922
1923 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1924 (int) blocknum));
1925
1926 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1927 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1928
1929 /* *THIS* is where we adjust what block we're going to...
1930 * but DO NOT TOUCH bp->b_blkno!!! */
1931 raid_addr = blocknum;
1932
1933 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1934 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1935 sum = raid_addr + num_blocks + pb;
1936 if (1 || rf_debugKernelAccess) {
1937 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1938 (int) raid_addr, (int) sum, (int) num_blocks,
1939 (int) pb, (int) bp->b_resid));
1940 }
1941 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1942 || (sum < num_blocks) || (sum < pb)) {
1943 rc = ENOSPC;
1944 goto done;
1945 }
1946 /*
1947 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1948 */
1949
1950 if (bp->b_bcount & raidPtr->sectorMask) {
1951 rc = ENOSPC;
1952 goto done;
1953 }
1954 db1_printf(("Calling DoAccess..\n"));
1955
1956
1957 rf_lock_mutex2(raidPtr->mutex);
1958 raidPtr->openings--;
1959 rf_unlock_mutex2(raidPtr->mutex);
1960
1961 /*
1962 * Everything is async.
1963 */
1964 do_async = 1;
1965
1966 /* don't ever condition on bp->b_flags & B_WRITE.
1967 * always condition on B_READ instead */
1968
1969 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
1970 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
1971 do_async, raid_addr, num_blocks,
1972 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
1973
1974 done:
1975 return rc;
1976 }
1977
1978 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
1979
/*
 * rf_DispatchKernelIO: issue a single RAIDframe disk-queue request to
 * the underlying component.
 *
 * Called with the disk queue mutex held (see header comment above).
 * READ/WRITE requests are packaged into req->bp via InitBP() and sent
 * down with bdev_strategy(); completion is reported asynchronously
 * through the KernelWakeupFunc() callback installed on the buf.
 * NOP requests are completed immediately by calling the callback
 * directly.  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	/* Remember which queue this request came from for the callback. */
	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		/* NOTE(review): the doubled parentheses are redundant --
		 * this is a plain unconditional printf. */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately via the normal I/O-done
		 * path so queue accounting stays consistent. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		/* Start timing the physical I/O for access tracing. */
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Point the buf at the component's vnode/device and
		 * arrange for KernelWakeupFunc(req) on completion. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			    (int) req->sectorOffset, (int) req->numSector,
			    (int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			    (int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
2053 /* this is the callback function associated with a I/O invoked from
2054 kernel code.
2055 */
/*
 * KernelWakeupFunc: completion callback installed on each buf by
 * rf_DispatchKernelIO()/InitBP().
 *
 * Stops per-access trace timers (when enabled), marks the component
 * failed on I/O error -- but only if it is currently optimal or a used
 * spare, and only while the set can still tolerate another failure --
 * then moves the request onto raidPtr->iodone and signals the raidio
 * thread, all under raidPtr->iodone_lock.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The request was stashed in b_private by InitBP(). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Account the elapsed time to both diskwait and phys_io. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_optimal) ||
		    (queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_used_spare)) &&
		    (queue->raidPtr->numFailures <
		    queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			    queue->raidPtr->raidid,
			    bp->b_error,
			    queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2122
2123
2124 /*
2125 * initialize a buf structure for doing an I/O in the kernel.
2126 */
2127 static void
2128 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2129 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2130 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2131 struct proc *b_proc)
2132 {
2133 /* bp->b_flags = B_PHYS | rw_flag; */
2134 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2135 bp->b_oflags = 0;
2136 bp->b_cflags = 0;
2137 bp->b_bcount = numSect << logBytesPerSector;
2138 bp->b_bufsize = bp->b_bcount;
2139 bp->b_error = 0;
2140 bp->b_dev = dev;
2141 bp->b_data = bf;
2142 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2143 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2144 if (bp->b_bcount == 0) {
2145 panic("bp->b_bcount is zero in InitBP!!");
2146 }
2147 bp->b_proc = b_proc;
2148 bp->b_iodone = cbFunc;
2149 bp->b_private = cbArg;
2150 }
2151
2152 /*
2153 * Wait interruptibly for an exclusive lock.
2154 *
2155 * XXX
2156 * Several drivers do this; it should be abstracted and made MP-safe.
2157 * (Hmm... where have we seen this warning before :-> GO )
2158 */
2159 static int
2160 raidlock(struct raid_softc *rs)
2161 {
2162 int error;
2163
2164 error = 0;
2165 mutex_enter(&rs->sc_mutex);
2166 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2167 rs->sc_flags |= RAIDF_WANTED;
2168 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2169 if (error != 0)
2170 goto done;
2171 }
2172 rs->sc_flags |= RAIDF_LOCKED;
2173 done:
2174 mutex_exit(&rs->sc_mutex);
2175 return (error);
2176 }
2177 /*
2178 * Unlock and wake up any waiters.
2179 */
2180 static void
2181 raidunlock(struct raid_softc *rs)
2182 {
2183
2184 mutex_enter(&rs->sc_mutex);
2185 rs->sc_flags &= ~RAIDF_LOCKED;
2186 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2187 rs->sc_flags &= ~RAIDF_WANTED;
2188 cv_broadcast(&rs->sc_cv);
2189 }
2190 mutex_exit(&rs->sc_mutex);
2191 }
2192
2193
2194 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2195 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2196 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2197
/*
 * Byte offset of the component label area on each component.
 * Currently a fixed constant; kept as a function so the offset could
 * some day depend on the component.
 */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
2204
2205 static daddr_t
2206 rf_component_info_size(unsigned secsize)
2207 {
2208 daddr_t info_size;
2209
2210 KASSERT(secsize);
2211 if (secsize > RF_COMPONENT_INFO_SIZE)
2212 info_size = secsize;
2213 else
2214 info_size = RF_COMPONENT_INFO_SIZE;
2215
2216 return info_size;
2217 }
2218
2219 static daddr_t
2220 rf_parity_map_offset(RF_Raid_t *raidPtr)
2221 {
2222 daddr_t map_offset;
2223
2224 KASSERT(raidPtr->bytesPerSector);
2225 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2226 map_offset = raidPtr->bytesPerSector;
2227 else
2228 map_offset = RF_COMPONENT_INFO_SIZE;
2229 map_offset += rf_component_info_offset();
2230
2231 return map_offset;
2232 }
2233
2234 static daddr_t
2235 rf_parity_map_size(RF_Raid_t *raidPtr)
2236 {
2237 daddr_t map_size;
2238
2239 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2240 map_size = raidPtr->bytesPerSector;
2241 else
2242 map_size = RF_PARITY_MAP_SIZE;
2243
2244 return map_size;
2245 }
2246
/*
 * Mark the in-core component label of column `col' as clean and flush
 * it out to the component.  Always returns 0; the result of the label
 * write is not propagated.
 */
int
raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_CLEAN;
	raidflush_component_label(raidPtr, col);
	return(0);
}
2257
2258
/*
 * Mark the in-core component label of column `col' as dirty and flush
 * it out to the component.  Always returns 0; the result of the label
 * write is not propagated.
 */
int
raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_DIRTY;
	raidflush_component_label(raidPtr, col);
	return(0);
}
2269
/*
 * Read the on-disk component label for column `col' into the in-core
 * copy kept in raid_cinfo[col].ci_label.  Returns 0 or an errno from
 * the underlying read.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2279
/*
 * Return a pointer to the in-core component label for column `col'.
 * No I/O is performed; callers that modify the label must flush it
 * with raidflush_component_label().
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2285
/*
 * Write the in-core component label for column `col' out to the
 * component.  The label's mod_counter (and, when parity maps are
 * enabled, its parity_map_modcount) is refreshed from the RAID set
 * before writing.  Returns 0 or an errno from the write.
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
2300
2301
/*
 * Read a component label from the component-info area of the device.
 * Thin wrapper around raidread_component_area() with the standard
 * label offset and size.
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
2311
/* ARGSUSED */
/*
 * Read `dsize' bytes at byte `offset' from the raw component `dev' and
 * copy the first `msize' bytes of the result into `data'.  Returns 0
 * on success or an errno from the I/O.  `b_vp' is used only as a
 * validity check: a NULL vnode means the component is not usable.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* Issue the read and wait for it to complete. */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
2349
2350
/*
 * Write a component label to the component-info area of the device.
 * Thin wrapper around raidwrite_component_area() with the standard
 * label offset and size; always synchronous (asyncp == 0).
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
2360
/* ARGSUSED */
/*
 * Write `msize' bytes from `data' (zero-padded out to `dsize') at byte
 * `offset' on the raw component `dev'.  If `asyncp' is nonzero the
 * write is issued B_ASYNC and 0 is returned immediately; otherwise we
 * wait for completion and return the I/O status.  `b_vp' is unused.
 *
 * NOTE(review): on the async path we return before biowait()/brelse();
 * presumably the async completion path releases the buffer -- confirm.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
	bp->b_resid = dsize;

	/* Zero-fill, then copy in the payload (msize may be < dsize). */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
2395
2396 void
2397 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2398 {
2399 int c;
2400
2401 for (c = 0; c < raidPtr->numCol; c++) {
2402 /* Skip dead disks. */
2403 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2404 continue;
2405 /* XXXjld: what if an error occurs here? */
2406 raidwrite_component_area(raidPtr->Disks[c].dev,
2407 raidPtr->raid_cinfo[c].ci_vp, map,
2408 RF_PARITYMAP_NBYTE,
2409 rf_parity_map_offset(raidPtr),
2410 rf_parity_map_size(raidPtr), 0);
2411 }
2412 }
2413
2414 void
2415 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2416 {
2417 struct rf_paritymap_ondisk tmp;
2418 int c,first;
2419
2420 first=1;
2421 for (c = 0; c < raidPtr->numCol; c++) {
2422 /* Skip dead disks. */
2423 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2424 continue;
2425 raidread_component_area(raidPtr->Disks[c].dev,
2426 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2427 RF_PARITYMAP_NBYTE,
2428 rf_parity_map_offset(raidPtr),
2429 rf_parity_map_size(raidPtr));
2430 if (first) {
2431 memcpy(map, &tmp, sizeof(*map));
2432 first = 0;
2433 } else {
2434 rf_paritymap_merge(map, &tmp);
2435 }
2436 }
2437 }
2438
/*
 * Mark every usable component (and every in-use spare) of the set as
 * dirty, bumping the set's mod_counter first.  Called when the set
 * goes into service so an unclean shutdown can be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare is standing in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			/* Rebuild the label from scratch, then record the
			   column the spare is substituting for. */
			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2498
2499
/*
 * Push fresh component labels out to every optimal component and every
 * in-use spare, bumping the set's mod_counter first.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, the components
 * are additionally marked clean (this is the normal-shutdown path).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare is standing in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2577
2578 void
2579 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2580 {
2581
2582 if (vp != NULL) {
2583 if (auto_configured == 1) {
2584 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2585 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2586 vput(vp);
2587
2588 } else {
2589 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2590 }
2591 }
2592 }
2593
2594
2595 void
2596 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2597 {
2598 int r,c;
2599 struct vnode *vp;
2600 int acd;
2601
2602
2603 /* We take this opportunity to close the vnodes like we should.. */
2604
2605 for (c = 0; c < raidPtr->numCol; c++) {
2606 vp = raidPtr->raid_cinfo[c].ci_vp;
2607 acd = raidPtr->Disks[c].auto_configured;
2608 rf_close_component(raidPtr, vp, acd);
2609 raidPtr->raid_cinfo[c].ci_vp = NULL;
2610 raidPtr->Disks[c].auto_configured = 0;
2611 }
2612
2613 for (r = 0; r < raidPtr->numSpare; r++) {
2614 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2615 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2616 rf_close_component(raidPtr, vp, acd);
2617 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2618 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2619 }
2620 }
2621
2622
/*
 * Kernel thread body: fail the component named in `req' and (if
 * RF_FDFLAGS_RECON is set) reconstruct it onto a spare.  Frees the
 * request and exits the thread when done.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2644
/*
 * Kernel thread body: rewrite all parity on the set.  On success the
 * set's parity_good flag is set to RF_RAID_CLEAN; on failure an error
 * is logged.  Wakes anyone blocked on parity_rewrite_in_progress if a
 * shutdown is waiting, then exits the thread.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2675
2676
/*
 * Kernel thread body: copy reconstructed data back from the spare to
 * a replaced component, then exit the thread.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2691
2692
/*
 * Kernel thread body: reconstruct the component named in `req' in
 * place (onto the same disk slot).  Frees the request and exits the
 * thread when done.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2710
/*
 * Try to read a component label from the device (dev, vp).  If the
 * label is reasonable, prepend a new RF_AutoConfig_t for it to
 * `ac_list' and keep the vnode open; otherwise close and release the
 * vnode.  Returns the (possibly extended) list, or NULL after freeing
 * the entire list if memory runs out.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* Out of memory: tear down everything collected so far. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;	/* vnode stays open for this component */
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2768
/*
 * Scan every disk device in the system for RAIDframe components and
 * return a list of RF_AutoConfig_t for those found.  Wedges are
 * scanned in a first pass and whole disks/partitions in a second, so
 * a wedge covering a whole disk wins over the raw partition.  For
 * each candidate device we open it, read its size, then look for
 * components in (a) the wedge itself, (b) each FS_RAID disklabel
 * partition, or (c) the raw partition if nothing else matched.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		    dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: accept only RAIDframe-typed
				   wedges; rf_get_component takes over the
				   vnode on success. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
2972
2973
/*
 * Sanity-check a component label read from disk: known version, valid
 * clean flag, row/column within the declared geometry, and positive
 * block size and block count.  Returns 1 if the label looks usable
 * (after scrubbing stale *Hi fields via rf_fix_old_label_size() when
 * the device size is known), 0 otherwise.
 */
int
rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
	    ((clabel->clean == RF_RAID_CLEAN) ||
	     (clabel->clean == RF_RAID_DIRTY)) &&
	    clabel->row >=0 &&
	    clabel->column >= 0 &&
	    clabel->num_rows > 0 &&
	    clabel->num_columns > 0 &&
	    clabel->row < clabel->num_rows &&
	    clabel->column < clabel->num_columns &&
	    clabel->blockSize > 0 &&
	    /*
	     * numBlocksHi may contain garbage, but it is ok since
	     * the type is unsigned. If it is really garbage,
	     * rf_fix_old_label_size() will fix it.
	     */
	    rf_component_label_numblocks(clabel) > 0) {
		/*
		 * label looks reasonable enough...
		 * let's make sure it has no old garbage.
		 */
		if (numsecs)
			rf_fix_old_label_size(clabel, numsecs);
		return(1);
	}
	return(0);
}
3005
3006
3007 /*
3008 * For reasons yet unknown, some old component labels have garbage in
3009 * the newer numBlocksHi region, and this causes lossage. Since those
3010 * disks will also have numsecs set to less than 32 bits of sectors,
3011 * we can determine when this corruption has occurred, and fix it.
3012 *
3013 * The exact same problem, with the same unknown reason, happens to
3014 * the partitionSizeHi member as well.
3015 */
3016 static void
3017 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3018 {
3019
3020 if (numsecs < ((uint64_t)1 << 32)) {
3021 if (clabel->numBlocksHi) {
3022 printf("WARNING: total sectors < 32 bits, yet "
3023 "numBlocksHi set\n"
3024 "WARNING: resetting numBlocksHi to zero.\n");
3025 clabel->numBlocksHi = 0;
3026 }
3027
3028 if (clabel->partitionSizeHi) {
3029 printf("WARNING: total sectors < 32 bits, yet "
3030 "partitionSizeHi set\n"
3031 "WARNING: resetting partitionSizeHi to zero.\n");
3032 clabel->partitionSizeHi = 0;
3033 }
3034 }
3035 }
3036
3037
#ifdef DEBUG
/*
 * Pretty-print the contents of a component label to the console.
 * Debug builds only.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3071
3072 RF_ConfigSet_t *
3073 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3074 {
3075 RF_AutoConfig_t *ac;
3076 RF_ConfigSet_t *config_sets;
3077 RF_ConfigSet_t *cset;
3078 RF_AutoConfig_t *ac_next;
3079
3080
3081 config_sets = NULL;
3082
3083 /* Go through the AutoConfig list, and figure out which components
3084 belong to what sets. */
3085 ac = ac_list;
3086 while(ac!=NULL) {
3087 /* we're going to putz with ac->next, so save it here
3088 for use at the end of the loop */
3089 ac_next = ac->next;
3090
3091 if (config_sets == NULL) {
3092 /* will need at least this one... */
3093 config_sets = (RF_ConfigSet_t *)
3094 malloc(sizeof(RF_ConfigSet_t),
3095 M_RAIDFRAME, M_NOWAIT);
3096 if (config_sets == NULL) {
3097 panic("rf_create_auto_sets: No memory!");
3098 }
3099 /* this one is easy :) */
3100 config_sets->ac = ac;
3101 config_sets->next = NULL;
3102 config_sets->rootable = 0;
3103 ac->next = NULL;
3104 } else {
3105 /* which set does this component fit into? */
3106 cset = config_sets;
3107 while(cset!=NULL) {
3108 if (rf_does_it_fit(cset, ac)) {
3109 /* looks like it matches... */
3110 ac->next = cset->ac;
3111 cset->ac = ac;
3112 break;
3113 }
3114 cset = cset->next;
3115 }
3116 if (cset==NULL) {
3117 /* didn't find a match above... new set..*/
3118 cset = (RF_ConfigSet_t *)
3119 malloc(sizeof(RF_ConfigSet_t),
3120 M_RAIDFRAME, M_NOWAIT);
3121 if (cset == NULL) {
3122 panic("rf_create_auto_sets: No memory!");
3123 }
3124 cset->ac = ac;
3125 ac->next = NULL;
3126 cset->next = config_sets;
3127 cset->rootable = 0;
3128 config_sets = cset;
3129 }
3130 }
3131 ac = ac_next;
3132 }
3133
3134
3135 return(config_sets);
3136 }
3137
3138 static int
3139 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3140 {
3141 RF_ComponentLabel_t *clabel1, *clabel2;
3142
3143 /* If this one matches the *first* one in the set, that's good
3144 enough, since the other members of the set would have been
3145 through here too... */
3146 /* note that we are not checking partitionSize here..
3147
3148 Note that we are also not checking the mod_counters here.
3149 If everything else matches except the mod_counter, that's
3150 good enough for this test. We will deal with the mod_counters
3151 a little later in the autoconfiguration process.
3152
3153 (clabel1->mod_counter == clabel2->mod_counter) &&
3154
3155 The reason we don't check for this is that failed disks
3156 will have lower modification counts. If those disks are
3157 not added to the set they used to belong to, then they will
3158 form their own set, which may result in 2 different sets,
3159 for example, competing to be configured at raid0, and
3160 perhaps competing to be the root filesystem set. If the
3161 wrong ones get configured, or both attempt to become /,
3162 weird behaviour and or serious lossage will occur. Thus we
3163 need to bring them into the fold here, and kick them out at
3164 a later point.
3165
3166 */
3167
3168 clabel1 = cset->ac->clabel;
3169 clabel2 = ac->clabel;
3170 if ((clabel1->version == clabel2->version) &&
3171 (clabel1->serial_number == clabel2->serial_number) &&
3172 (clabel1->num_rows == clabel2->num_rows) &&
3173 (clabel1->num_columns == clabel2->num_columns) &&
3174 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3175 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3176 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3177 (clabel1->parityConfig == clabel2->parityConfig) &&
3178 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3179 (clabel1->blockSize == clabel2->blockSize) &&
3180 rf_component_label_numblocks(clabel1) ==
3181 rf_component_label_numblocks(clabel2) &&
3182 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3183 (clabel1->root_partition == clabel2->root_partition) &&
3184 (clabel1->last_unit == clabel2->last_unit) &&
3185 (clabel1->config_order == clabel2->config_order)) {
3186 /* if it get's here, it almost *has* to be a match */
3187 } else {
3188 /* it's not consistent with somebody in the set..
3189 punt */
3190 return(0);
3191 }
3192 /* all was fine.. it must fit... */
3193 return(1);
3194 }
3195
/*
 * Decide whether a configuration set has enough live components to be
 * configured.  The reference mod_counter is the highest one seen in
 * the set; components with a stale mod_counter count as missing.
 * RAID 1 is special-cased: components are treated as even/odd pairs,
 * and losing both halves of a pair is fatal.  For RAID 0 no missing
 * components are tolerated; for RAID 4/5, at most one.  Returns 1 if
 * configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* Look for a current (mod_counter matches) component
		   for this column. */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3298
3299 void
3300 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3301 RF_Raid_t *raidPtr)
3302 {
3303 RF_ComponentLabel_t *clabel;
3304 int i;
3305
3306 clabel = ac->clabel;
3307
3308 /* 1. Fill in the common stuff */
3309 config->numRow = clabel->num_rows = 1;
3310 config->numCol = clabel->num_columns;
3311 config->numSpare = 0; /* XXX should this be set here? */
3312 config->sectPerSU = clabel->sectPerSU;
3313 config->SUsPerPU = clabel->SUsPerPU;
3314 config->SUsPerRU = clabel->SUsPerRU;
3315 config->parityConfig = clabel->parityConfig;
3316 /* XXX... */
3317 strcpy(config->diskQueueType,"fifo");
3318 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3319 config->layoutSpecificSize = 0; /* XXX ?? */
3320
3321 while(ac!=NULL) {
3322 /* row/col values will be in range due to the checks
3323 in reasonable_label() */
3324 strcpy(config->devnames[0][ac->clabel->column],
3325 ac->devname);
3326 ac = ac->next;
3327 }
3328
3329 for(i=0;i<RF_MAXDBGV;i++) {
3330 config->debugVars[i][0] = 0;
3331 }
3332 }
3333
3334 int
3335 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3336 {
3337 RF_ComponentLabel_t *clabel;
3338 int column;
3339 int sparecol;
3340
3341 raidPtr->autoconfigure = new_value;
3342
3343 for(column=0; column<raidPtr->numCol; column++) {
3344 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3345 clabel = raidget_component_label(raidPtr, column);
3346 clabel->autoconfigure = new_value;
3347 raidflush_component_label(raidPtr, column);
3348 }
3349 }
3350 for(column = 0; column < raidPtr->numSpare ; column++) {
3351 sparecol = raidPtr->numCol + column;
3352 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3353 clabel = raidget_component_label(raidPtr, sparecol);
3354 clabel->autoconfigure = new_value;
3355 raidflush_component_label(raidPtr, sparecol);
3356 }
3357 }
3358 return(new_value);
3359 }
3360
3361 int
3362 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3363 {
3364 RF_ComponentLabel_t *clabel;
3365 int column;
3366 int sparecol;
3367
3368 raidPtr->root_partition = new_value;
3369 for(column=0; column<raidPtr->numCol; column++) {
3370 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3371 clabel = raidget_component_label(raidPtr, column);
3372 clabel->root_partition = new_value;
3373 raidflush_component_label(raidPtr, column);
3374 }
3375 }
3376 for(column = 0; column < raidPtr->numSpare ; column++) {
3377 sparecol = raidPtr->numCol + column;
3378 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3379 clabel = raidget_component_label(raidPtr, sparecol);
3380 clabel->root_partition = new_value;
3381 raidflush_component_label(raidPtr, sparecol);
3382 }
3383 }
3384 return(new_value);
3385 }
3386
3387 void
3388 rf_release_all_vps(RF_ConfigSet_t *cset)
3389 {
3390 RF_AutoConfig_t *ac;
3391
3392 ac = cset->ac;
3393 while(ac!=NULL) {
3394 /* Close the vp, and give it back */
3395 if (ac->vp) {
3396 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3397 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3398 vput(ac->vp);
3399 ac->vp = NULL;
3400 }
3401 ac = ac->next;
3402 }
3403 }
3404
3405
3406 void
3407 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3408 {
3409 RF_AutoConfig_t *ac;
3410 RF_AutoConfig_t *next_ac;
3411
3412 ac = cset->ac;
3413 while(ac!=NULL) {
3414 next_ac = ac->next;
3415 /* nuke the label */
3416 free(ac->clabel, M_RAIDFRAME);
3417 /* cleanup the config structure */
3418 free(ac, M_RAIDFRAME);
3419 /* "next.." */
3420 ac = next_ac;
3421 }
3422 /* and, finally, nuke the config set */
3423 free(cset, M_RAIDFRAME);
3424 }
3425
3426
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	 * Populate a component label with the set-wide state of the
	 * RAID set.  Note that per-component fields (e.g. column) are
	 * not set here.
	 */

	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* Geometry and status of the set as a whole. */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* Layout parameters: stripe/parity/reconstruction unit sizes. */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	/* Remember which raid unit this was, for reuse at autoconfig. */
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3459
/*
 * Configure a RAID set from the collection of component labels in
 * `cset'.  Returns the attached softc on success, or NULL if memory
 * allocation or rf_Configure() fails.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	 */

	/*
	 * Start at the unit recorded in the label; walk forward past
	 * any units that are already valid (configured).
	 */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* No free existing unit found -- ask for a fresh one. */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed: release the softc we claimed. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3543
/*
 * Initialize a pool of objects of `size' bytes at IPL_BIO, prime it
 * with xmin items, and set the low/high watermarks to xmin/xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3553
/*
 * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buffer queue
 * to see if there is IO pending and if that IO could possibly be done
 * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
 * otherwise.
 *
 */
3561 int
3562 rf_buf_queue_check(RF_Raid_t *raidPtr)
3563 {
3564 struct raid_softc *rs;
3565 struct dk_softc *dksc;
3566
3567 rs = raidPtr->softc;
3568 dksc = &rs->sc_dksc;
3569
3570 if ((rs->sc_flags & RAIDF_INITED) == 0)
3571 return 1;
3572
3573 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3574 /* there is work to do */
3575 return 0;
3576 }
3577 /* default is nothing to do */
3578 return 1;
3579 }
3580
3581 int
3582 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3583 {
3584 uint64_t numsecs;
3585 unsigned secsize;
3586 int error;
3587
3588 error = getdisksize(vp, &numsecs, &secsize);
3589 if (error == 0) {
3590 diskPtr->blockSize = secsize;
3591 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3592 diskPtr->partitionSize = numsecs;
3593 return 0;
3594 }
3595 return error;
3596 }
3597
/*
 * Autoconfiguration match hook: always succeeds (pseudo-device).
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3603
/*
 * Autoconfiguration attach hook: nothing to do here; the actual
 * setup is driven by the configuration path (see rf_auto_config_set()).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3608
3609
3610 static int
3611 raid_detach(device_t self, int flags)
3612 {
3613 int error;
3614 struct raid_softc *rs = raidsoftc(self);
3615
3616 if (rs == NULL)
3617 return ENXIO;
3618
3619 if ((error = raidlock(rs)) != 0)
3620 return (error);
3621
3622 error = raid_detach_unlocked(rs);
3623
3624 raidunlock(rs);
3625
3626 /* XXX raid can be referenced here */
3627
3628 if (error)
3629 return error;
3630
3631 /* Free the softc */
3632 raidput(rs);
3633
3634 return 0;
3635 }
3636
3637 static void
3638 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3639 {
3640 struct dk_softc *dksc = &rs->sc_dksc;
3641 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3642
3643 memset(dg, 0, sizeof(*dg));
3644
3645 dg->dg_secperunit = raidPtr->totalSectors;
3646 dg->dg_secsize = raidPtr->bytesPerSector;
3647 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3648 dg->dg_ntracks = 4 * raidPtr->numCol;
3649
3650 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3651 }
3652
3653 /*
3654 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3655 * We end up returning whatever error was returned by the first cache flush
3656 * that fails.
3657 */
3658
3659 int
3660 rf_sync_component_caches(RF_Raid_t *raidPtr)
3661 {
3662 int c, sparecol;
3663 int e,error;
3664 int force = 1;
3665
3666 error = 0;
3667 for (c = 0; c < raidPtr->numCol; c++) {
3668 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3669 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3670 &force, FWRITE, NOCRED);
3671 if (e) {
3672 if (e != ENODEV)
3673 printf("raid%d: cache flush to component %s failed.\n",
3674 raidPtr->raidid, raidPtr->Disks[c].devname);
3675 if (error == 0) {
3676 error = e;
3677 }
3678 }
3679 }
3680 }
3681
3682 for( c = 0; c < raidPtr->numSpare ; c++) {
3683 sparecol = raidPtr->numCol + c;
3684 /* Need to ensure that the reconstruct actually completed! */
3685 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3686 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3687 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3688 if (e) {
3689 if (e != ENODEV)
3690 printf("raid%d: cache flush to component %s failed.\n",
3691 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3692 if (error == 0) {
3693 error = e;
3694 }
3695 }
3696 }
3697 }
3698 return error;
3699 }
3700
3701 /*
3702 * Module interface
3703 */
3704
3705 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
3706
3707 #ifdef _MODULE
3708 CFDRIVER_DECL(raid, DV_DISK, NULL);
3709 #endif
3710
3711 static int raid_modcmd(modcmd_t, void *);
3712 static int raid_modcmd_init(void);
3713 static int raid_modcmd_fini(void);
3714
3715 static int
3716 raid_modcmd(modcmd_t cmd, void *data)
3717 {
3718 int error;
3719
3720 error = 0;
3721 switch (cmd) {
3722 case MODULE_CMD_INIT:
3723 error = raid_modcmd_init();
3724 break;
3725 case MODULE_CMD_FINI:
3726 error = raid_modcmd_fini();
3727 break;
3728 default:
3729 error = ENOTTY;
3730 break;
3731 }
3732 return error;
3733 }
3734
/*
 * Module initialization: attach the block/character devsw, the
 * cfdriver (module build only) and the cfattach, boot RAIDframe,
 * and register a finalizer that auto-configures RAID sets once
 * device discovery is complete.  Each failure path rolls back the
 * steps taken so far.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attachment. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back the cfdriver and devsw attachments. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* NOTE(review): error is 0 on every path reaching here. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: autoconfiguration simply won't happen. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
3805
/*
 * Module teardown: refuse to unload while any raid device exists;
 * otherwise detach the cfattach, cfdriver (module build only) and
 * devsw in turn, re-attaching already-detached pieces if a later
 * step fails, then shut down RAIDframe and destroy the global lock.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Undo the cfattach detach above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Undo the cfdriver/cfattach detaches above. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
3855