/*	$NetBSD: rf_netbsdkintf.c,v 1.343 2016/01/07 14:15:26 christos Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Greg Oster; Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *
 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
 */

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Mark Holland, Jim Zelenka
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/***********************************************************
 *
 * rf_kintf.c -- the kernel interface routines for RAIDframe
 *
 ***********************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.343 2016/01/07 14:15:26 christos Exp $");

#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_raid_autoconfig.h"
#endif

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/disk.h>
#include <sys/device.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/reboot.h>
#include <sys/kauth.h>
#include <sys/module.h>

#include <prop/proplib.h>

#include <dev/raidframe/raidframevar.h>
#include <dev/raidframe/raidframeio.h>
#include <dev/raidframe/rf_paritymap.h>

#include "rf_raid.h"
#include "rf_copyback.h"
#include "rf_dag.h"
#include "rf_dagflags.h"
#include "rf_desc.h"
#include "rf_diskqueue.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_kintf.h"
#include "rf_options.h"
#include "rf_driver.h"
#include "rf_parityscan.h"
#include "rf_threadstuff.h"

#ifdef COMPAT_50
#include "rf_compat50.h"
#endif

#include "ioconf.h"

#ifdef DEBUG
int rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else	/* DEBUG */
#define db1_printf(a) { }
#endif	/* DEBUG */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */
#endif

MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");

/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int, struct proc *);
struct raid_softc;
static void raidinit(struct raid_softc *);
static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);

static int raid_match(device_t, cfdata_t, void *);
static void raid_attach(device_t, device_t, void *);
static int raid_detach(device_t, int);

static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

static int raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);

static int raid_diskstart(device_t, struct buf *bp);
static int raid_dumpblocks(device_t, void *, daddr_t, int);
static int raid_lastclose(device_t);

static dev_type_open(raidopen);
static dev_type_close(raidclose);
static dev_type_read(raidread);
static dev_type_write(raidwrite);
static dev_type_ioctl(raidioctl);
static dev_type_strategy(raidstrategy);
static dev_type_dump(raiddump);
static dev_type_size(raidsize);

const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};

struct raid_softc {
	struct dk_softc sc_dksc;
	int	sc_unit;
	int	sc_flags;	/* flags */
	int	sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char	sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;
	LIST_ENTRY(raid_softc) sc_link;
};
/* sc_flags */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
#define RAIDF_DETACH		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

#define raidunit(x)	DISKUNIT(x)
#define raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)

extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING		6
#endif
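
/*
 * A config-time sketch (an assumption, not part of this file): since the
 * default above is guarded by #ifndef, a kernel configuration can
 * presumably override it with something like
 *
 *	options 	RAIDOUTSTANDING=10
 *
 * Per the arithmetic in the comment above, each extra outstanding write
 * on that 5-disk example can pin roughly 128K-192K of kernel memory, so
 * raise this with care.
 */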

#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

void rf_ReconThread(struct rf_recon_req *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req *);
int rf_autoconfig(device_t);
void rf_buildroothack(RF_ConfigSet_t *);

RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *, RF_AutoConfig_t *);
int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
void rf_create_configuration(RF_AutoConfig_t *, RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
static bool raidautoconfigdone = false;

struct RF_Pools_s rf_pools;

static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;

static struct raid_softc *
raidcreate(int unit) {
	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
	if (sc == NULL) {
#ifdef DIAGNOSTIC
		printf("%s: out of memory\n", __func__);
#endif
		return NULL;
	}
	sc->sc_unit = unit;
	cv_init(&sc->sc_cv, "raidunit");
	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
	return sc;
}

static void
raiddestroy(struct raid_softc *sc) {
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mutex);
	kmem_free(sc, sizeof(*sc));
}

static struct raid_softc *
raidget(int unit, bool create) {
	struct raid_softc *sc;
	if (unit < 0) {
#ifdef DIAGNOSTIC
		panic("%s: unit %d!", __func__, unit);
#endif
		return NULL;
	}
	mutex_enter(&raid_lock);
	LIST_FOREACH(sc, &raids, sc_link) {
		if (sc->sc_unit == unit) {
			mutex_exit(&raid_lock);
			return sc;
		}
	}
	mutex_exit(&raid_lock);
	if (!create)
		return NULL;
	if ((sc = raidcreate(unit)) == NULL)
		return NULL;
	mutex_enter(&raid_lock);
	LIST_INSERT_HEAD(&raids, sc, sc_link);
	mutex_exit(&raid_lock);
	return sc;
}

static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
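
/*
 * Illustrative lifecycle of the softc list above (a sketch of the
 * pattern used throughout this file, not additional driver code):
 *
 *	struct raid_softc *sc = raidget(unit, true);  // lookup or create
 *	if (sc == NULL)
 *		return ENXIO;
 *	...
 *	raidput(sc);	// unlink from "raids" and raiddestroy() it
 *
 * raidget(unit, false) is a pure lookup and never allocates; raidput()
 * is only safe once no other thread can still reach sc.
 */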

void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}

int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}

static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname = device_xname(bdv);
	size_t len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
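
/*
 * Worked example of the matching above: for a component label naming
 * "/dev/wd0e" and a booted_device of "wd0", devname is advanced past
 * "/dev/" to "wd0e", and strncmp("wd0e", "wd0", 3) == 0, so the set is
 * taken to contain the boot device.  A "dk" (wedge) component is first
 * translated to its parent disk's name before the comparison.
 */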

void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}
	dksc = &rsc->sc_dksc;

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume 'a' */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = dksc->sc_dev;
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}

static int
raidsize(dev_t dev)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;
	unsigned int unit;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return -1;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return -1;

	return dk_size(dksc, dev);
}

static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

	/*
	   Note that blkno is relative to this particular partition.
	   By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
	*/
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size);
}
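
/*
 * Worked example, assuming the conventional RF_PROTECTED_SECTORS of 64:
 * a dump aimed at partition-relative block 0 of the RAID device is
 * redirected to block 64 of the underlying component partition, so the
 * component label area is never overwritten by a crash dump.
 */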

static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for (j = 0; j < raidPtr->numCol; j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}

/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int error = 0;
	int part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}

static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/* Last one... device is not unconfigured yet.
	   Device shutdown has taken care of setting the
	   clean bits if RAIDF_INITED is not set;
	   mark things as clean... */

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}

/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}

static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}

static void
raidstrategy(struct buf *bp)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Raid_t *raidPtr;

	unit = raidunit(bp->b_dev);
	if ((rs = raidget(unit, false)) == NULL) {
		bp->b_error = ENXIO;
		goto fail;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto fail;
	}
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	/* Queue IO only */
	if (dk_strategy_defer(dksc, bp))
		goto done;

	/* schedule the IO to happen at the next convenient time */
	raid_wakeup(raidPtr);

done:
	return;

fail:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}

static int
raid_diskstart(device_t dev, struct buf *bp)
{
	struct raid_softc *rs = raidsoftc(dev);
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		db1_printf(("raid is not valid..\n"));
		return ENODEV;
	}

	/* XXX */
	bp->b_resid = 0;

	return raiddoaccess(raidPtr, bp);
}

void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}

/* ARGSUSED */
static int
raidread(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (ENXIO);

	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));

}

/* ARGSUSED */
static int
raidwrite(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (ENXIO);

	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));

}

static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}

static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int unit = raidunit(dev);
	int error = 0;
	int part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
	/* int raidid; */
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr, *componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
	    (int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	switch (cmd) {
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	switch (cmd) {
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
		return rf_get_info50(raidPtr, data);

	case RAIDFRAME_CONFIGURE50:
		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif
		/* configure the system */
	case RAIDFRAME_CONFIGURE:

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n", unit);
			return (EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		u_cfg = *((RF_Config_t **) data);
		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
			    retcode));
			goto no_config;
		}
		goto config;
	config:
		rs->sc_flags &= ~RAIDF_SHUTDOWN;

		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = EINVAL;
				goto no_config;
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = ENOMEM;
				goto no_config;
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
				    k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
				    retcode));
				goto no_config;
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 * there is no stale data left in the case of a
		 * reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			raid_wakeup(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

	no_config:
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		if (retcode != 0)
			rs->sc_flags |= RAIDF_SHUTDOWN;
		return (retcode);
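
		/*
		 * For reference, a userland caller (a raidctl-like
		 * program; hypothetical sketch, not kernel code) passes
		 * a *pointer* to its RF_Config_t, matching the double
		 * indirection copied in above:
		 *
		 *	RF_Config_t cfg;
		 *	void *p = &cfg;
		 *	// ... fill in cfg from the config file ...
		 *	if (ioctl(fd, RAIDFRAME_CONFIGURE, &p) == -1)
		 *		err(1, "RAIDFRAME_CONFIGURE");
		 */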

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		clabel_ptr = (RF_ComponentLabel_t **) data;
		/* need to read the component label for the disk indicated
		   by row,column in clabel */

		/*
		 * Perhaps there should be an option to skip the in-core
		 * copy and hit the disk, as with disklabel(8).
		 */
		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));

		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));

		if (retcode) {
			RF_Free(clabel, sizeof(*clabel));
			return retcode;
		}

		clabel->row = 0; /* Don't allow looking at anything else.*/

		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol +
		    raidPtr->numSpare)) {
			RF_Free(clabel, sizeof(*clabel));
			return EINVAL;
		}

		RF_Free(clabel, sizeof(*clabel));

		clabel = raidget_component_label(raidPtr, column);

		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return (EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		 */

		raidPtr->serial_number = clabel->serial_number;

		for (column = 0; column < raidPtr->numCol; column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we don't pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return (0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return (EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
		    rf_RewriteParityThread,
		    raidPtr, "raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return (retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		return (retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr,
		    sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return (retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr,
		    sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return (retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return (EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return (EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy(&component, componentPtr,
		    sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return (EINVAL);
		}

		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			    raidPtr->raidid);
			printf("raid%d: Col: %d Too many failures.\n",
			    raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			    raidPtr->raidid);
			printf("raid%d: Col: %d Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return (ENOMEM);

		rrcopy->raidPtr = (void *) raidPtr;
		rrcopy->col = column;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
		    rf_ReconstructInPlaceThread,
		    rrcopy, "raid_reconip");
		return (retcode);

	case RAIDFRAME_GET_INFO:
		if (!raidPtr->valid)
			return (ENODEV);
		ucfgp = (RF_DeviceConfig_t **) data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
		    (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		d_cfg->rows = 1; /* there is only 1 row now */
		d_cfg->cols = raidPtr->numCol;
		d_cfg->ndevs = raidPtr->numCol;
		if (d_cfg->ndevs >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->nspares = raidPtr->numSpare;
		if (d_cfg->nspares >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
		d = 0;
		for (j = 0; j < d_cfg->cols; j++) {
			d_cfg->devs[d] = raidPtr->Disks[j];
			d++;
		}
		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
			d_cfg->spares[i] = raidPtr->Disks[j];
			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
				/* XXX: raidctl(8) expects to see this as a used spare */
				d_cfg->spares[i].status = rf_ds_used_spare;
			}
		}
		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
		    (struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return (EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		rr->row = 0;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);


		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		    rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return (ENOMEM);
		memcpy(rrcopy, rr, sizeof(*rr));
		rrcopy->raidPtr = (void *) raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
		    rf_ReconThread,
		    rrcopy, "raid_recon");
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return (EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return (EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
		    rf_CopybackThread,
		    raidPtr, "raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return (0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->status != rf_rs_reconstructing) {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		} else {
			progressInfo.total =
			    raidPtr->reconControl->numRUsTotal;
			progressInfo.completed =
			    raidPtr->reconControl->numRUsComplete;
			progressInfo.remaining = progressInfo.total -
			    progressInfo.completed;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
		    sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return (0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
			    raidPtr->parity_rewrite_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->parity_rewrite_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
			    raidPtr->parity_rewrite_stripes_done;
			progressInfo.remaining = progressInfo.total -
			    progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
		    sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return (0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->copyback_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
			    raidPtr->copyback_stripes_done;
			progressInfo.remaining = progressInfo.total -
			    progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
		    sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_SET_LAST_UNIT:
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	error = dk_ioctl(dksc, dev, cmd, data, flag, l);
	if (error != EPASSTHROUGH)
		return (error);

	switch (cmd) {
	case DIOCCACHESYNC:
		return rf_sync_component_caches(raidPtr);

	default:
		retcode = ENOTTY;
	}
	return (retcode);

}


/* raidinit -- complete the rest of the initialization for the
   RAIDframe device.  */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2() drops the mutex while we sleep */
1858 while (!rf_sparet_resp_queue) {
1859 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
1860 }
1861 req = rf_sparet_resp_queue;
1862 rf_sparet_resp_queue = req->next;
1863 rf_unlock_mutex2(rf_sparet_wait_mutex);
1864
1865 retcode = req->fcol;
1866 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1867 * alloc'd */
1868 return (retcode);
1869 }
1870 #endif
1871
1872 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1873 * bp & passes it down.
1874 * any calls originating in the kernel must use non-blocking I/O
1875 * do some extra sanity checking to return "appropriate" error values for
1876 * certain conditions (to make some standard utilities work)
1877 *
1878 * Formerly known as: rf_DoAccessKernel
1879 */
1880 void
1881 raidstart(RF_Raid_t *raidPtr)
1882 {
1883 struct raid_softc *rs;
1884 struct dk_softc *dksc;
1885
1886 rs = raidPtr->softc;
1887 dksc = &rs->sc_dksc;
1888 /* quick check to see if anything has died recently */
1889 rf_lock_mutex2(raidPtr->mutex);
1890 if (raidPtr->numNewFailures > 0) {
1891 rf_unlock_mutex2(raidPtr->mutex);
1892 rf_update_component_labels(raidPtr,
1893 RF_NORMAL_COMPONENT_UPDATE);
1894 rf_lock_mutex2(raidPtr->mutex);
1895 raidPtr->numNewFailures--;
1896 }
1897 rf_unlock_mutex2(raidPtr->mutex);
1898
1899 if ((rs->sc_flags & RAIDF_INITED) == 0) {
1900 printf("raid%d: raidstart not ready\n", raidPtr->raidid);
1901 return;
1902 }
1903
1904 dk_start(dksc, NULL);
1905 }
1906
1907 static int
1908 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
1909 {
1910 RF_SectorCount_t num_blocks, pb, sum;
1911 RF_RaidAddr_t raid_addr;
1912 daddr_t blocknum;
1913 int do_async;
1914 int rc;
1915
1916 rf_lock_mutex2(raidPtr->mutex);
1917 if (raidPtr->openings == 0) {
1918 rf_unlock_mutex2(raidPtr->mutex);
1919 return EAGAIN;
1920 }
1921 rf_unlock_mutex2(raidPtr->mutex);
1922
1923 blocknum = bp->b_rawblkno;
1924
1925 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1926 (int) blocknum));
1927
1928 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1929 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1930
1931 /* *THIS* is where we adjust what block we're going to...
1932 * but DO NOT TOUCH bp->b_blkno!!! */
1933 raid_addr = blocknum;
1934
1935 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1936 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1937 sum = raid_addr + num_blocks + pb;
1938 if (rf_debugKernelAccess) {
1939 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1940 (int) raid_addr, (int) sum, (int) num_blocks,
1941 (int) pb, (int) bp->b_resid));
1942 }
1943 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1944 || (sum < num_blocks) || (sum < pb)) {
1945 rc = ENOSPC;
1946 goto done;
1947 }
1948 /*
1949 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1950 */
1951
1952 if (bp->b_bcount & raidPtr->sectorMask) {
1953 rc = ENOSPC;
1954 goto done;
1955 }
1956 db1_printf(("Calling DoAccess..\n"));
1957
1958
1959 rf_lock_mutex2(raidPtr->mutex);
1960 raidPtr->openings--;
1961 rf_unlock_mutex2(raidPtr->mutex);
1962
1963 /*
1964 * Everything is async.
1965 */
1966 do_async = 1;
1967
1968 /* Don't ever condition on bp->b_flags & B_WRITE;
1969  * always condition on B_READ instead. */
1970
1971 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
1972 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
1973 do_async, raid_addr, num_blocks,
1974 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
1975
1976 done:
1977 return rc;
1978 }
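
/*
 * Worked example of the bounds arithmetic in raiddoaccess() above
 * (illustrative only): for a 64 KiB read at raw block 1000 on a set
 * with 512-byte sectors (logBytesPerSector == 9, sectorMask == 511):
 *
 *	num_blocks = 65536 >> 9            = 128
 *	pb         = (65536 & 511) ? 1 : 0 = 0
 *	sum        = 1000 + 128 + 0        = 1128
 *
 * The access is rejected with ENOSPC unless sum <= totalSectors; the
 * additional "sum < ..." comparisons catch arithmetic wrap-around.
 */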
1979
1980 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
1981
1982 int
1983 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
1984 {
1985 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
1986 struct buf *bp;
1987
1988 req->queue = queue;
1989 bp = req->bp;
1990
1991 switch (req->type) {
1992 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
1993 /* XXX need to do something extra here.. */
1994 /* I'm leaving this in, as I've never actually seen it used,
1995 * and I'd like folks to report it... GO */
1996 printf("WAKEUP CALLED\n");
1997 queue->numOutstanding++;
1998
1999 bp->b_flags = 0;
2000 bp->b_private = req;
2001
2002 KernelWakeupFunc(bp);
2003 break;
2004
2005 case RF_IO_TYPE_READ:
2006 case RF_IO_TYPE_WRITE:
2007 #if RF_ACC_TRACE > 0
2008 if (req->tracerec) {
2009 RF_ETIMER_START(req->tracerec->timer);
2010 }
2011 #endif
2012 InitBP(bp, queue->rf_cinfo->ci_vp,
2013 op, queue->rf_cinfo->ci_dev,
2014 req->sectorOffset, req->numSector,
2015 req->buf, KernelWakeupFunc, (void *) req,
2016 queue->raidPtr->logBytesPerSector, req->b_proc);
2017
2018 if (rf_debugKernelAccess) {
2019 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2020 (long) bp->b_blkno));
2021 }
2022 queue->numOutstanding++;
2023 queue->last_deq_sector = req->sectorOffset;
2024 /* acc wouldn't have been let in if there were any pending
2025 * reqs at any other priority */
2026 queue->curPriority = req->priority;
2027
2028 db1_printf(("Going for %c to unit %d col %d\n",
2029 req->type, queue->raidPtr->raidid,
2030 queue->col));
2031 db1_printf(("sector %d count %d (%d bytes) %d\n",
2032 (int) req->sectorOffset, (int) req->numSector,
2033 (int) (req->numSector <<
2034 queue->raidPtr->logBytesPerSector),
2035 (int) queue->raidPtr->logBytesPerSector));
2036
2037 /*
2038 * XXX: drop lock here since this can block at
2039 * least with backing SCSI devices. Retake it
2040 * to minimize fuss with calling interfaces.
2041 */
2042
2043 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2044 bdev_strategy(bp);
2045 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2046 break;
2047
2048 default:
2049 panic("bad req->type in rf_DispatchKernelIO");
2050 }
2051 db1_printf(("Exiting from DispatchKernelIO\n"));
2052
2053 return (0);
2054 }
2055 /* This is the callback function associated with an I/O invoked from
2056  * kernel code.
2057  */
2058 static void
2059 KernelWakeupFunc(struct buf *bp)
2060 {
2061 RF_DiskQueueData_t *req = NULL;
2062 RF_DiskQueue_t *queue;
2063
2064 db1_printf(("recovering the request queue:\n"));
2065
2066 req = bp->b_private;
2067
2068 queue = (RF_DiskQueue_t *) req->queue;
2069
2070 rf_lock_mutex2(queue->raidPtr->iodone_lock);
2071
2072 #if RF_ACC_TRACE > 0
2073 if (req->tracerec) {
2074 RF_ETIMER_STOP(req->tracerec->timer);
2075 RF_ETIMER_EVAL(req->tracerec->timer);
2076 rf_lock_mutex2(rf_tracing_mutex);
2077 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2078 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2079 req->tracerec->num_phys_ios++;
2080 rf_unlock_mutex2(rf_tracing_mutex);
2081 }
2082 #endif
2083
2084 /* XXX Ok, let's get aggressive... If b_error is set, let's go
2085 * ballistic, and mark the component as hosed... */
2086
2087 if (bp->b_error != 0) {
2088 /* Mark the disk as dead */
2089 /* but only mark it once... */
2090 /* and only if it wouldn't leave this RAID set
2091 completely broken */
2092 if (((queue->raidPtr->Disks[queue->col].status ==
2093 rf_ds_optimal) ||
2094 (queue->raidPtr->Disks[queue->col].status ==
2095 rf_ds_used_spare)) &&
2096 (queue->raidPtr->numFailures <
2097 queue->raidPtr->Layout.map->faultsTolerated)) {
2098 printf("raid%d: IO Error (%d). Marking %s as failed.\n",
2099 queue->raidPtr->raidid,
2100 bp->b_error,
2101 queue->raidPtr->Disks[queue->col].devname);
2102 queue->raidPtr->Disks[queue->col].status =
2103 rf_ds_failed;
2104 queue->raidPtr->status = rf_rs_degraded;
2105 queue->raidPtr->numFailures++;
2106 queue->raidPtr->numNewFailures++;
2107 } else { /* Disk is already dead... */
2108 /* printf("Disk already marked as dead!\n"); */
2109 }
2110
2111 }
2112
2113 /* Fill in the error value */
2114 req->error = bp->b_error;
2115
2116 /* Drop this one on the "finished" queue... */
2117 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2118
2119 /* Let the raidio thread know there is work to be done. */
2120 rf_signal_cond2(queue->raidPtr->iodone_cv);
2121
2122 rf_unlock_mutex2(queue->raidPtr->iodone_lock);
2123 }
2124
2125
2126 /*
2127 * initialize a buf structure for doing an I/O in the kernel.
2128 */
2129 static void
2130 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2131 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2132 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2133 struct proc *b_proc)
2134 {
2135 /* bp->b_flags = B_PHYS | rw_flag; */
2136 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2137 bp->b_oflags = 0;
2138 bp->b_cflags = 0;
2139 bp->b_bcount = numSect << logBytesPerSector;
2140 bp->b_bufsize = bp->b_bcount;
2141 bp->b_error = 0;
2142 bp->b_dev = dev;
2143 bp->b_data = bf;
2144 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2145 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2146 if (bp->b_bcount == 0) {
2147 panic("bp->b_bcount is zero in InitBP!!");
2148 }
2149 bp->b_proc = b_proc;
2150 bp->b_iodone = cbFunc;
2151 bp->b_private = cbArg;
2152 }
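
/*
 * Example of the b_blkno conversion in InitBP() above (illustrative
 * only): b_blkno is in DEV_BSIZE (512-byte) units, so on a set with
 * 4096-byte sectors (logBytesPerSector == 12, DEV_BSHIFT == 9) sector
 * 10 maps to
 *
 *	b_blkno = 10 << 12 >> 9 = 80
 *
 * i.e. each RAIDframe sector covers eight DEV_BSIZE blocks.
 */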
2153
2154 /*
2155 * Wait interruptibly for an exclusive lock.
2156 *
2157 * XXX
2158 * Several drivers do this; it should be abstracted and made MP-safe.
2159 * (Hmm... where have we seen this warning before :-> GO )
2160 */
2161 static int
2162 raidlock(struct raid_softc *rs)
2163 {
2164 int error;
2165
2166 error = 0;
2167 mutex_enter(&rs->sc_mutex);
2168 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2169 rs->sc_flags |= RAIDF_WANTED;
2170 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2171 if (error != 0)
2172 goto done;
2173 }
2174 rs->sc_flags |= RAIDF_LOCKED;
2175 done:
2176 mutex_exit(&rs->sc_mutex);
2177 return (error);
2178 }
2179 /*
2180 * Unlock and wake up any waiters.
2181 */
2182 static void
2183 raidunlock(struct raid_softc *rs)
2184 {
2185
2186 mutex_enter(&rs->sc_mutex);
2187 rs->sc_flags &= ~RAIDF_LOCKED;
2188 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2189 rs->sc_flags &= ~RAIDF_WANTED;
2190 cv_broadcast(&rs->sc_cv);
2191 }
2192 mutex_exit(&rs->sc_mutex);
2193 }
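
/*
 * Minimal sketch of the intended raidlock()/raidunlock() usage (see
 * raid_detach() below for a real caller):
 *
 *	if ((error = raidlock(rs)) != 0)
 *		return error;
 *	... exclusive access to the softc ...
 *	raidunlock(rs);
 */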
2194
2195
2196 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2197 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2198 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2199
2200 static daddr_t
2201 rf_component_info_offset(void)
2202 {
2203
2204 return RF_COMPONENT_INFO_OFFSET;
2205 }
2206
2207 static daddr_t
2208 rf_component_info_size(unsigned secsize)
2209 {
2210 daddr_t info_size;
2211
2212 KASSERT(secsize);
2213 if (secsize > RF_COMPONENT_INFO_SIZE)
2214 info_size = secsize;
2215 else
2216 info_size = RF_COMPONENT_INFO_SIZE;
2217
2218 return info_size;
2219 }
2220
2221 static daddr_t
2222 rf_parity_map_offset(RF_Raid_t *raidPtr)
2223 {
2224 daddr_t map_offset;
2225
2226 KASSERT(raidPtr->bytesPerSector);
2227 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2228 map_offset = raidPtr->bytesPerSector;
2229 else
2230 map_offset = RF_COMPONENT_INFO_SIZE;
2231 map_offset += rf_component_info_offset();
2232
2233 return map_offset;
2234 }
2235
2236 static daddr_t
2237 rf_parity_map_size(RF_Raid_t *raidPtr)
2238 {
2239 daddr_t map_size;
2240
2241 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2242 map_size = raidPtr->bytesPerSector;
2243 else
2244 map_size = RF_PARITY_MAP_SIZE;
2245
2246 return map_size;
2247 }
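
/*
 * Worked example of the on-disk layout computed above (illustrative
 * only): with 512-byte sectors the component info sits at byte offset
 * 16384 and occupies max(512, 1024) = 1024 bytes, so the parity map
 * starts at 16384 + 1024 = 17408.  With 4096-byte sectors the info
 * area is padded to a full sector and the map starts at
 * 16384 + 4096 = 20480.  Both areas are assumed to fit within the
 * region reserved by rf_protectedSectors (cf. the XXX note in
 * raidread_component_area() below).
 */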
2248
2249 int
2250 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2251 {
2252 RF_ComponentLabel_t *clabel;
2253
2254 clabel = raidget_component_label(raidPtr, col);
2255 clabel->clean = RF_RAID_CLEAN;
2256 raidflush_component_label(raidPtr, col);
2257 return(0);
2258 }
2259
2260
2261 int
2262 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2263 {
2264 RF_ComponentLabel_t *clabel;
2265
2266 clabel = raidget_component_label(raidPtr, col);
2267 clabel->clean = RF_RAID_DIRTY;
2268 raidflush_component_label(raidPtr, col);
2269 return(0);
2270 }
2271
2272 int
2273 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2274 {
2275 KASSERT(raidPtr->bytesPerSector);
2276 return raidread_component_label(raidPtr->bytesPerSector,
2277 raidPtr->Disks[col].dev,
2278 raidPtr->raid_cinfo[col].ci_vp,
2279 &raidPtr->raid_cinfo[col].ci_label);
2280 }
2281
2282 RF_ComponentLabel_t *
2283 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2284 {
2285 return &raidPtr->raid_cinfo[col].ci_label;
2286 }
2287
2288 int
2289 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2290 {
2291 RF_ComponentLabel_t *label;
2292
2293 label = &raidPtr->raid_cinfo[col].ci_label;
2294 label->mod_counter = raidPtr->mod_counter;
2295 #ifndef RF_NO_PARITY_MAP
2296 label->parity_map_modcount = label->mod_counter;
2297 #endif
2298 return raidwrite_component_label(raidPtr->bytesPerSector,
2299 raidPtr->Disks[col].dev,
2300 raidPtr->raid_cinfo[col].ci_vp, label);
2301 }
2302
2303
2304 static int
2305 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2306 RF_ComponentLabel_t *clabel)
2307 {
2308 return raidread_component_area(dev, b_vp, clabel,
2309 sizeof(RF_ComponentLabel_t),
2310 rf_component_info_offset(),
2311 rf_component_info_size(secsize));
2312 }
2313
2314 /* ARGSUSED */
2315 static int
2316 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2317 size_t msize, daddr_t offset, daddr_t dsize)
2318 {
2319 struct buf *bp;
2320 int error;
2321
2322 /* XXX should probably ensure that we don't try to do this if
2323 someone has changed rf_protected_sectors. */
2324
2325 if (b_vp == NULL) {
2326 /* For whatever reason, this component is not valid.
2327 Don't try to read a component label from it. */
2328 return(EINVAL);
2329 }
2330
2331 /* get a block of the appropriate size... */
2332 bp = geteblk((int)dsize);
2333 bp->b_dev = dev;
2334
2335 /* get our ducks in a row for the read */
2336 bp->b_blkno = offset / DEV_BSIZE;
2337 bp->b_bcount = dsize;
2338 bp->b_flags |= B_READ;
2339 bp->b_resid = dsize;
2340
2341 bdev_strategy(bp);
2342 error = biowait(bp);
2343
2344 if (!error) {
2345 memcpy(data, bp->b_data, msize);
2346 }
2347
2348 brelse(bp, 0);
2349 return(error);
2350 }
2351
2352
2353 static int
2354 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2355 RF_ComponentLabel_t *clabel)
2356 {
2357 return raidwrite_component_area(dev, b_vp, clabel,
2358 sizeof(RF_ComponentLabel_t),
2359 rf_component_info_offset(),
2360 rf_component_info_size(secsize), 0);
2361 }
2362
2363 /* ARGSUSED */
2364 static int
2365 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2366 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2367 {
2368 struct buf *bp;
2369 int error;
2370
2371 /* get a block of the appropriate size... */
2372 bp = geteblk((int)dsize);
2373 bp->b_dev = dev;
2374
2375 /* get our ducks in a row for the write */
2376 bp->b_blkno = offset / DEV_BSIZE;
2377 bp->b_bcount = dsize;
2378 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2379 bp->b_resid = dsize;
2380
2381 memset(bp->b_data, 0, dsize);
2382 memcpy(bp->b_data, data, msize);
2383
2384 bdev_strategy(bp);
2385 if (asyncp)
2386 return 0;
2387 error = biowait(bp);
2388 brelse(bp, 0);
2389 if (error) {
2391 printf("Failed to write RAID component info!\n");
2393 }
2394
2395 return(error);
2396 }
2397
2398 void
2399 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2400 {
2401 int c;
2402
2403 for (c = 0; c < raidPtr->numCol; c++) {
2404 /* Skip dead disks. */
2405 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2406 continue;
2407 /* XXXjld: what if an error occurs here? */
2408 raidwrite_component_area(raidPtr->Disks[c].dev,
2409 raidPtr->raid_cinfo[c].ci_vp, map,
2410 RF_PARITYMAP_NBYTE,
2411 rf_parity_map_offset(raidPtr),
2412 rf_parity_map_size(raidPtr), 0);
2413 }
2414 }
2415
2416 void
2417 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2418 {
2419 struct rf_paritymap_ondisk tmp;
2420 int c, first;
2421
2422 first = 1;
2423 for (c = 0; c < raidPtr->numCol; c++) {
2424 /* Skip dead disks. */
2425 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2426 continue;
2427 raidread_component_area(raidPtr->Disks[c].dev,
2428 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2429 RF_PARITYMAP_NBYTE,
2430 rf_parity_map_offset(raidPtr),
2431 rf_parity_map_size(raidPtr));
2432 if (first) {
2433 memcpy(map, &tmp, sizeof(*map));
2434 first = 0;
2435 } else {
2436 rf_paritymap_merge(map, &tmp);
2437 }
2438 }
2439 }
2440
2441 void
2442 rf_markalldirty(RF_Raid_t *raidPtr)
2443 {
2444 RF_ComponentLabel_t *clabel;
2445 int sparecol;
2446 int c;
2447 int j;
2448 int scol = -1;
2449
2450 raidPtr->mod_counter++;
2451 for (c = 0; c < raidPtr->numCol; c++) {
2452 /* we don't want to touch (at all) a disk that has
2453 failed */
2454 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2455 clabel = raidget_component_label(raidPtr, c);
2456 if (clabel->status == rf_ds_spared) {
2457 /* XXX do something special...
2458 but whatever you do, don't
2459 try to access it!! */
2460 } else {
2461 raidmarkdirty(raidPtr, c);
2462 }
2463 }
2464 }
2465
2466 for (c = 0; c < raidPtr->numSpare; c++) {
2467 sparecol = raidPtr->numCol + c;
2468 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2469 /*
2470  * We claim this disk is "optimal" if it's
2471  * rf_ds_used_spare, as that means it should be
2472  * directly substitutable for the disk it replaced.
2473  * We note that too...
2474  */
2477
2478 for (j = 0; j < raidPtr->numCol; j++) {
2479 if (raidPtr->Disks[j].spareCol == sparecol) {
2480 scol = j;
2481 break;
2482 }
2483 }
2484
2485 clabel = raidget_component_label(raidPtr, sparecol);
2486 /* make sure status is noted */
2487
2488 raid_init_component_label(raidPtr, clabel);
2489
2490 clabel->row = 0;
2491 clabel->column = scol;
2492 /* Note: we *don't* change status from rf_ds_used_spare
2493 to rf_ds_optimal */
2494 /* clabel.status = rf_ds_optimal; */
2495
2496 raidmarkdirty(raidPtr, sparecol);
2497 }
2498 }
2499 }
2500
2501
2502 void
2503 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2504 {
2505 RF_ComponentLabel_t *clabel;
2506 int sparecol;
2507 int c;
2508 int j;
2509 int scol;
2510 struct raid_softc *rs = raidPtr->softc;
2511
2512 scol = -1;
2513
2514 /* XXX should do extra checks to make sure things really are clean,
2515 rather than blindly setting the clean bit... */
2516
2517 raidPtr->mod_counter++;
2518
2519 for (c = 0; c < raidPtr->numCol; c++) {
2520 if (raidPtr->Disks[c].status == rf_ds_optimal) {
2521 clabel = raidget_component_label(raidPtr, c);
2522 /* make sure status is noted */
2523 clabel->status = rf_ds_optimal;
2524
2525 /* note what unit we are configured as */
2526 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2527 clabel->last_unit = raidPtr->raidid;
2528
2529 raidflush_component_label(raidPtr, c);
2530 if (final == RF_FINAL_COMPONENT_UPDATE) {
2531 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2532 raidmarkclean(raidPtr, c);
2533 }
2534 }
2535 }
2536 /* else we don't touch it.. */
2537 }
2538
2539 for (c = 0; c < raidPtr->numSpare; c++) {
2540 sparecol = raidPtr->numCol + c;
2541 /* Need to ensure that the reconstruct actually completed! */
2542 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2543 /*
2544  * We claim this disk is "optimal" if it's
2545  * rf_ds_used_spare, as that means it should be
2546  * directly substitutable for the disk it replaced.
2547  * We note that too...
2548  */
2551
2552 for (j = 0; j < raidPtr->numCol; j++) {
2553 if (raidPtr->Disks[j].spareCol == sparecol) {
2554 scol = j;
2555 break;
2556 }
2557 }
2558
2559 /* XXX shouldn't *really* need this... */
2560 clabel = raidget_component_label(raidPtr, sparecol);
2561 /* make sure status is noted */
2562
2563 raid_init_component_label(raidPtr, clabel);
2564
2565 clabel->column = scol;
2566 clabel->status = rf_ds_optimal;
2567 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2568 clabel->last_unit = raidPtr->raidid;
2569
2570 raidflush_component_label(raidPtr, sparecol);
2571 if (final == RF_FINAL_COMPONENT_UPDATE) {
2572 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2573 raidmarkclean(raidPtr, sparecol);
2574 }
2575 }
2576 }
2577 }
2578 }
2579
2580 void
2581 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2582 {
2583
2584 if (vp != NULL) {
2585 if (auto_configured == 1) {
2586 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2587 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2588 vput(vp);
2589
2590 } else {
2591 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2592 }
2593 }
2594 }
2595
2596
2597 void
2598 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2599 {
2600 int r,c;
2601 struct vnode *vp;
2602 int acd;
2603
2604
2605 /* We take this opportunity to close the vnodes like we should.. */
2606
2607 for (c = 0; c < raidPtr->numCol; c++) {
2608 vp = raidPtr->raid_cinfo[c].ci_vp;
2609 acd = raidPtr->Disks[c].auto_configured;
2610 rf_close_component(raidPtr, vp, acd);
2611 raidPtr->raid_cinfo[c].ci_vp = NULL;
2612 raidPtr->Disks[c].auto_configured = 0;
2613 }
2614
2615 for (r = 0; r < raidPtr->numSpare; r++) {
2616 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2617 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2618 rf_close_component(raidPtr, vp, acd);
2619 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2620 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2621 }
2622 }
2623
2624
2625 void
2626 rf_ReconThread(struct rf_recon_req *req)
2627 {
2628 int s;
2629 RF_Raid_t *raidPtr;
2630
2631 s = splbio();
2632 raidPtr = (RF_Raid_t *) req->raidPtr;
2633 raidPtr->recon_in_progress = 1;
2634
2635 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2636 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2637
2638 RF_Free(req, sizeof(*req));
2639
2640 raidPtr->recon_in_progress = 0;
2641 splx(s);
2642
2643 /* That's all... */
2644 kthread_exit(0); /* does not return */
2645 }
2646
2647 void
2648 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2649 {
2650 int retcode;
2651 int s;
2652
2653 raidPtr->parity_rewrite_stripes_done = 0;
2654 raidPtr->parity_rewrite_in_progress = 1;
2655 s = splbio();
2656 retcode = rf_RewriteParity(raidPtr);
2657 splx(s);
2658 if (retcode) {
2659 printf("raid%d: Error re-writing parity (%d)!\n",
2660 raidPtr->raidid, retcode);
2661 } else {
2662 /* set the clean bit! If we shutdown correctly,
2663 the clean bit on each component label will get
2664 set */
2665 raidPtr->parity_good = RF_RAID_CLEAN;
2666 }
2667 raidPtr->parity_rewrite_in_progress = 0;
2668
2669 /* Anyone waiting for us to stop? If so, inform them... */
2670 if (raidPtr->waitShutdown) {
2671 wakeup(&raidPtr->parity_rewrite_in_progress);
2672 }
2673
2674 /* That's all... */
2675 kthread_exit(0); /* does not return */
2676 }
2677
2678
2679 void
2680 rf_CopybackThread(RF_Raid_t *raidPtr)
2681 {
2682 int s;
2683
2684 raidPtr->copyback_in_progress = 1;
2685 s = splbio();
2686 rf_CopybackReconstructedData(raidPtr);
2687 splx(s);
2688 raidPtr->copyback_in_progress = 0;
2689
2690 /* That's all... */
2691 kthread_exit(0); /* does not return */
2692 }
2693
2694
2695 void
2696 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2697 {
2698 int s;
2699 RF_Raid_t *raidPtr;
2700
2701 s = splbio();
2702 raidPtr = req->raidPtr;
2703 raidPtr->recon_in_progress = 1;
2704 rf_ReconstructInPlace(raidPtr, req->col);
2705 RF_Free(req, sizeof(*req));
2706 raidPtr->recon_in_progress = 0;
2707 splx(s);
2708
2709 /* That's all... */
2710 kthread_exit(0); /* does not return */
2711 }
2712
2713 static RF_AutoConfig_t *
2714 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2715 const char *cname, RF_SectorCount_t size, uint64_t numsecs,
2716 unsigned secsize)
2717 {
2718 int good_one = 0;
2719 RF_ComponentLabel_t *clabel;
2720 RF_AutoConfig_t *ac;
2721
2722 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
2723 if (clabel == NULL) {
2724 oomem:
2725 while(ac_list) {
2726 ac = ac_list;
2727 if (ac->clabel)
2728 free(ac->clabel, M_RAIDFRAME);
2729 ac_list = ac_list->next;
2730 free(ac, M_RAIDFRAME);
2731 }
2732 printf("RAID auto config: out of memory!\n");
2733 return NULL; /* XXX probably should panic? */
2734 }
2735
2736 if (!raidread_component_label(secsize, dev, vp, clabel)) {
2737 /* Got the label. Does it look reasonable? */
2738 if (rf_reasonable_label(clabel, numsecs) &&
2739 (rf_component_label_partitionsize(clabel) <= size)) {
2740 #ifdef DEBUG
2741 printf("Component on: %s: %llu\n",
2742 cname, (unsigned long long)size);
2743 rf_print_component_label(clabel);
2744 #endif
2745 /* if it's reasonable, add it, else ignore it. */
2746 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2747 M_NOWAIT);
2748 if (ac == NULL) {
2749 free(clabel, M_RAIDFRAME);
2750 goto oomem;
2751 }
2752 strlcpy(ac->devname, cname, sizeof(ac->devname));
2753 ac->dev = dev;
2754 ac->vp = vp;
2755 ac->clabel = clabel;
2756 ac->next = ac_list;
2757 ac_list = ac;
2758 good_one = 1;
2759 }
2760 }
2761 if (!good_one) {
2762 /* cleanup */
2763 free(clabel, M_RAIDFRAME);
2764 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2765 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2766 vput(vp);
2767 }
2768 return ac_list;
2769 }
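
/*
 * Note on vnode ownership in rf_get_component() above: on success the
 * still-open vnode is stashed in the new RF_AutoConfig_t (and later
 * released via rf_release_all_vps() or taken over by configuration);
 * on any failure it is closed and vput() here.  Callers must not touch
 * vp again once it has been handed in.
 */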
2770
2771 RF_AutoConfig_t *
2772 rf_find_raid_components(void)
2773 {
2774 struct vnode *vp;
2775 struct disklabel label;
2776 device_t dv;
2777 deviter_t di;
2778 dev_t dev;
2779 int bmajor, bminor, wedge, rf_part_found;
2780 int error;
2781 int i;
2782 RF_AutoConfig_t *ac_list;
2783 uint64_t numsecs;
2784 unsigned secsize;
2785 int dowedges;
2786
2787 /* initialize the AutoConfig list */
2788 ac_list = NULL;
2789
2790 /*
2791 * We begin by trolling through *all* the devices on the system, *twice*:
2792 * first we scan for wedges, then for everything else.  This avoids
2793 * using a raw partition instead of a wedge that covers the whole disk.
2794 */
2795
2796 for (dowedges = 1; dowedges >= 0; --dowedges) {
2797 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
2798 dv = deviter_next(&di)) {
2799
2800 /* we are only interested in disks... */
2801 if (device_class(dv) != DV_DISK)
2802 continue;
2803
2804 /* we don't care about floppies... */
2805 if (device_is_a(dv, "fd")) {
2806 continue;
2807 }
2808
2809 /* we don't care about CD's... */
2810 if (device_is_a(dv, "cd")) {
2811 continue;
2812 }
2813
2814 /* we don't care about md's... */
2815 if (device_is_a(dv, "md")) {
2816 continue;
2817 }
2818
2819 /* hdfd is the Atari/Hades floppy driver */
2820 if (device_is_a(dv, "hdfd")) {
2821 continue;
2822 }
2823
2824 /* fdisa is the Atari/Milan floppy driver */
2825 if (device_is_a(dv, "fdisa")) {
2826 continue;
2827 }
2828
2829 /* are we in the wedges pass ? */
2830 wedge = device_is_a(dv, "dk");
2831 if (wedge != dowedges) {
2832 continue;
2833 }
2834
2835 /* need to find the device_name_to_block_device_major stuff */
2836 bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
2837
2838 rf_part_found = 0; /* no RAID partition as yet */
2839
2840 /* get a vnode for the raw partition of this disk */
2841 bminor = minor(device_unit(dv));
2842 dev = wedge ? makedev(bmajor, bminor) :
2843 MAKEDISKDEV(bmajor, bminor, RAW_PART);
2844 if (bdevvp(dev, &vp))
2845 panic("RAID can't alloc vnode");
2846
2847 error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
2848
2849 if (error) {
2850 /* "Who cares." Continue looking
2851 for something that exists */
2852 vput(vp);
2853 continue;
2854 }
2855
2856 error = getdisksize(vp, &numsecs, &secsize);
2857 if (error) {
2858 /*
2859 * Pseudo devices like vnd and cgd can be
2860 * opened but may still need some configuration.
2861 * Ignore these quietly.
2862 */
2863 if (error != ENXIO)
2864 printf("RAIDframe: can't get disk size"
2865 " for dev %s (%d)\n",
2866 device_xname(dv), error);
2867 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2868 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2869 vput(vp);
2870 continue;
2871 }
2872 if (wedge) {
2873 struct dkwedge_info dkw;
2874 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
2875 NOCRED);
2876 if (error) {
2877 printf("RAIDframe: can't get wedge info for "
2878 "dev %s (%d)\n", device_xname(dv), error);
2879 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2880 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2881 vput(vp);
2882 continue;
2883 }
2884
2885 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
2886 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2887 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2888 vput(vp);
2889 continue;
2890 }
2891
2892 ac_list = rf_get_component(ac_list, dev, vp,
2893 device_xname(dv), dkw.dkw_size, numsecs, secsize);
2894 rf_part_found = 1; /* there is a RAID component on this disk */
2895 continue;
2896 }
2897
2898 /* Ok, the disk exists. Go get the disklabel. */
2899 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
2900 if (error) {
2901 /*
2902 * XXX can't happen - open() would
2903 * have errored out (or faked up one)
2904 */
2905 if (error != ENOTTY)
2906 printf("RAIDframe: can't get label for dev "
2907 "%s (%d)\n", device_xname(dv), error);
2908 }
2909
2910 /* don't need this any more. We'll allocate it again
2911 a little later if we really do... */
2912 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2913 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2914 vput(vp);
2915
2916 if (error)
2917 continue;
2918
2919 rf_part_found = 0; /* no RAID partitions yet */
2920 for (i = 0; i < label.d_npartitions; i++) {
2921 char cname[sizeof(ac_list->devname)];
2922
2923 /* We only support partitions marked as RAID */
2924 if (label.d_partitions[i].p_fstype != FS_RAID)
2925 continue;
2926
2927 dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
2928 if (bdevvp(dev, &vp))
2929 panic("RAID can't alloc vnode");
2930
2931 error = VOP_OPEN(vp, FREAD, NOCRED);
2932 if (error) {
2933 /* Whatever... */
2934 vput(vp);
2935 continue;
2936 }
2937 snprintf(cname, sizeof(cname), "%s%c",
2938 device_xname(dv), 'a' + i);
2939 ac_list = rf_get_component(ac_list, dev, vp, cname,
2940 label.d_partitions[i].p_size, numsecs, secsize);
2941 rf_part_found = 1; /* at least one RAID partition on this disk */
2942 }
2943
2944 /*
2945  * If there is no RAID component on this disk, either in a
2946  * disklabel or inside a wedge, check the raw partition as well,
2947  * as it is possible to configure RAID components on raw disk
2948  * devices.
2949  */
2950
2951 if (!rf_part_found) {
2952 char cname[sizeof(ac_list->devname)];
2953
2954 dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
2955 if (bdevvp(dev, &vp))
2956 panic("RAID can't alloc vnode");
2957
2958 error = VOP_OPEN(vp, FREAD, NOCRED);
2959 if (error) {
2960 /* Whatever... */
2961 vput(vp);
2962 continue;
2963 }
2964 snprintf(cname, sizeof(cname), "%s%c",
2965 device_xname(dv), 'a' + RAW_PART);
2966 ac_list = rf_get_component(ac_list, dev, vp, cname,
2967 label.d_partitions[RAW_PART].p_size, numsecs, secsize);
2968 }
2969 }
2970 deviter_release(&di);
2971 }
2972 return ac_list;
2973 }
2974
2975
2976 int
2977 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
2978 {
2979
2980 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2981 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2982 ((clabel->clean == RF_RAID_CLEAN) ||
2983 (clabel->clean == RF_RAID_DIRTY)) &&
2984 clabel->row >= 0 &&
2985 clabel->column >= 0 &&
2986 clabel->num_rows > 0 &&
2987 clabel->num_columns > 0 &&
2988 clabel->row < clabel->num_rows &&
2989 clabel->column < clabel->num_columns &&
2990 clabel->blockSize > 0 &&
2991 /*
2992 * numBlocksHi may contain garbage, but it is ok since
2993 * the type is unsigned. If it is really garbage,
2994 * rf_fix_old_label_size() will fix it.
2995 */
2996 rf_component_label_numblocks(clabel) > 0) {
2997 /*
2998 * label looks reasonable enough...
2999 * let's make sure it has no old garbage.
3000 */
3001 if (numsecs)
3002 rf_fix_old_label_size(clabel, numsecs);
3003 return(1);
3004 }
3005 return(0);
3006 }
3007
3008
3009 /*
3010 * For reasons yet unknown, some old component labels have garbage in
3011 * the newer numBlocksHi region, and this causes lossage. Since those
3012 * disks will also have numsecs set to less than 32 bits of sectors,
3013 * we can determine when this corruption has occurred, and fix it.
3014 *
3015 * The exact same problem, with the same unknown reason, happens to
3016 * the partitionSizeHi member as well.
3017 */
3018 static void
3019 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3020 {
3021
3022 if (numsecs < ((uint64_t)1 << 32)) {
3023 if (clabel->numBlocksHi) {
3024 printf("WARNING: total sectors < 32 bits, yet "
3025 "numBlocksHi set\n"
3026 "WARNING: resetting numBlocksHi to zero.\n");
3027 clabel->numBlocksHi = 0;
3028 }
3029
3030 if (clabel->partitionSizeHi) {
3031 printf("WARNING: total sectors < 32 bits, yet "
3032 "partitionSizeHi set\n"
3033 "WARNING: resetting partitionSizeHi to zero.\n");
3034 clabel->partitionSizeHi = 0;
3035 }
3036 }
3037 }
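
/*
 * Example (illustrative): a 1 TB component has roughly 1.95 * 10^9
 * 512-byte sectors, comfortably below 2^32, so a nonzero numBlocksHi
 * or partitionSizeHi read from such a disk can only be stale garbage
 * and is safely zeroed above.
 */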
3038
3039
3040 #ifdef DEBUG
3041 void
3042 rf_print_component_label(RF_ComponentLabel_t *clabel)
3043 {
3044 uint64_t numBlocks;
3045 static const char *rp[] = {
3046 "No", "Force", "Soft", "*invalid*"
3047 };
3048
3049
3050 numBlocks = rf_component_label_numblocks(clabel);
3051
3052 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
3053 clabel->row, clabel->column,
3054 clabel->num_rows, clabel->num_columns);
3055 printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
3056 clabel->version, clabel->serial_number,
3057 clabel->mod_counter);
3058 printf(" Clean: %s Status: %d\n",
3059 clabel->clean ? "Yes" : "No", clabel->status);
3060 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
3061 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
3062 printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n",
3063 (char) clabel->parityConfig, clabel->blockSize, numBlocks);
3064 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
3065 printf(" Root partition: %s\n", rp[clabel->root_partition & 3]);
3066 printf(" Last configured as: raid%d\n", clabel->last_unit);
3067 #if 0
3068 printf(" Config order: %d\n", clabel->config_order);
3069 #endif
3070
3071 }
3072 #endif
3073
3074 RF_ConfigSet_t *
3075 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3076 {
3077 RF_AutoConfig_t *ac;
3078 RF_ConfigSet_t *config_sets;
3079 RF_ConfigSet_t *cset;
3080 RF_AutoConfig_t *ac_next;
3081
3082
3083 config_sets = NULL;
3084
3085 /* Go through the AutoConfig list, and figure out which components
3086 belong to what sets. */
3087 ac = ac_list;
3088 while (ac != NULL) {
3089 /* we're going to putz with ac->next, so save it here
3090 for use at the end of the loop */
3091 ac_next = ac->next;
3092
3093 if (config_sets == NULL) {
3094 /* will need at least this one... */
3095 config_sets = (RF_ConfigSet_t *)
3096 malloc(sizeof(RF_ConfigSet_t),
3097 M_RAIDFRAME, M_NOWAIT);
3098 if (config_sets == NULL) {
3099 panic("rf_create_auto_sets: No memory!");
3100 }
3101 /* this one is easy :) */
3102 config_sets->ac = ac;
3103 config_sets->next = NULL;
3104 config_sets->rootable = 0;
3105 ac->next = NULL;
3106 } else {
3107 /* which set does this component fit into? */
3108 cset = config_sets;
3109 while (cset != NULL) {
3110 if (rf_does_it_fit(cset, ac)) {
3111 /* looks like it matches... */
3112 ac->next = cset->ac;
3113 cset->ac = ac;
3114 break;
3115 }
3116 cset = cset->next;
3117 }
3118 if (cset == NULL) {
3119 /* didn't find a match above... new set.. */
3120 cset = (RF_ConfigSet_t *)
3121 malloc(sizeof(RF_ConfigSet_t),
3122 M_RAIDFRAME, M_NOWAIT);
3123 if (cset == NULL) {
3124 panic("rf_create_auto_sets: No memory!");
3125 }
3126 cset->ac = ac;
3127 ac->next = NULL;
3128 cset->next = config_sets;
3129 cset->rootable = 0;
3130 config_sets = cset;
3131 }
3132 }
3133 ac = ac_next;
3134 }
3135
3136
3137 return(config_sets);
3138 }
3139
3140 static int
3141 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3142 {
3143 RF_ComponentLabel_t *clabel1, *clabel2;
3144
3145 /* If this one matches the *first* one in the set, that's good
3146 enough, since the other members of the set would have been
3147 through here too... */
3148 /* note that we are not checking partitionSize here..
3149
3150 Note that we are also not checking the mod_counters here.
3151 If everything else matches except the mod_counter, that's
3152 good enough for this test. We will deal with the mod_counters
3153 a little later in the autoconfiguration process.
3154
3155 (clabel1->mod_counter == clabel2->mod_counter) &&
3156
3157 The reason we don't check for this is that failed disks
3158 will have lower modification counts. If those disks are
3159 not added to the set they used to belong to, then they will
3160 form their own set, which may result in 2 different sets,
3161 for example, competing to be configured at raid0, and
3162 perhaps competing to be the root filesystem set. If the
3163 wrong ones get configured, or both attempt to become /,
3164 weird behaviour and/or serious lossage will occur. Thus we
3165 need to bring them into the fold here, and kick them out at
3166 a later point.
3167
3168 */
3169
3170 clabel1 = cset->ac->clabel;
3171 clabel2 = ac->clabel;
3172 if ((clabel1->version == clabel2->version) &&
3173 (clabel1->serial_number == clabel2->serial_number) &&
3174 (clabel1->num_rows == clabel2->num_rows) &&
3175 (clabel1->num_columns == clabel2->num_columns) &&
3176 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3177 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3178 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3179 (clabel1->parityConfig == clabel2->parityConfig) &&
3180 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3181 (clabel1->blockSize == clabel2->blockSize) &&
3182 rf_component_label_numblocks(clabel1) ==
3183 rf_component_label_numblocks(clabel2) &&
3184 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3185 (clabel1->root_partition == clabel2->root_partition) &&
3186 (clabel1->last_unit == clabel2->last_unit) &&
3187 (clabel1->config_order == clabel2->config_order)) {
3188 /* if it gets here, it almost *has* to be a match */
3189 } else {
3190 /* it's not consistent with somebody in the set..
3191 punt */
3192 return(0);
3193 }
3194 /* all was fine.. it must fit... */
3195 return(1);
3196 }
3197
3198 int
3199 rf_have_enough_components(RF_ConfigSet_t *cset)
3200 {
3201 RF_AutoConfig_t *ac;
3202 RF_AutoConfig_t *auto_config;
3203 RF_ComponentLabel_t *clabel;
3204 int c;
3205 int num_cols;
3206 int num_missing;
3207 int mod_counter;
3208 int mod_counter_found;
3209 int even_pair_failed;
3210 char parity_type;
3211
3212
3213 /* check to see that we have enough 'live' components
3214 of this set. If so, we can configure it if necessary */
3215
3216 num_cols = cset->ac->clabel->num_columns;
3217 parity_type = cset->ac->clabel->parityConfig;
3218
3219 /* XXX Check for duplicate components!?!?!? */
3220
3221 /* Determine what the mod_counter is supposed to be for this set. */
3222
3223 mod_counter_found = 0;
3224 mod_counter = 0;
3225 ac = cset->ac;
3226 while (ac != NULL) {
3227 if (mod_counter_found == 0) {
3228 mod_counter = ac->clabel->mod_counter;
3229 mod_counter_found = 1;
3230 } else {
3231 if (ac->clabel->mod_counter > mod_counter) {
3232 mod_counter = ac->clabel->mod_counter;
3233 }
3234 }
3235 ac = ac->next;
3236 }
3237
3238 num_missing = 0;
3239 auto_config = cset->ac;
3240
3241 even_pair_failed = 0;
3242 for (c = 0; c < num_cols; c++) {
3243 ac = auto_config;
3244 while (ac != NULL) {
3245 if ((ac->clabel->column == c) &&
3246 (ac->clabel->mod_counter == mod_counter)) {
3247 /* it's this one... */
3248 #ifdef DEBUG
3249 printf("Found: %s at %d\n",
3250 ac->devname,c);
3251 #endif
3252 break;
3253 }
3254 ac = ac->next;
3255 }
3256 if (ac == NULL) {
3257 /* Didn't find one here! */
3258 /* special case for RAID 1, especially
3259 where there are more than 2
3260 components (where RAIDframe treats
3261 things a little differently :( ) */
3262 if (parity_type == '1') {
3263 if (c%2 == 0) { /* even component */
3264 even_pair_failed = 1;
3265 } else { /* odd component. If
3266 we're failed, and
3267 so is the even
3268 component, it's
3269 "Good Night, Charlie" */
3270 if (even_pair_failed == 1) {
3271 return(0);
3272 }
3273 }
3274 } else {
3275 /* normal accounting */
3276 num_missing++;
3277 }
3278 }
3279 if ((parity_type == '1') && (c%2 == 1)) {
3280 /* Just did the odd component of a pair, and we
3281 didn't bail.. reset the even_pair_failed flag,
3282 and go on to the next pair.... */
3283 even_pair_failed = 0;
3284 }
3285 }
3286
3287 clabel = cset->ac->clabel;
3288
3289 if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3290 ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3291 ((clabel->parityConfig == '5') && (num_missing > 1))) {
3292 /* XXX this needs to be made *much* more general */
3293 /* Too many failures */
3294 return(0);
3295 }
3296 /* otherwise, all is well, and we've got enough to take a kick
3297 at autoconfiguring this set */
3298 return(1);
3299 }
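
/*
 * Worked example of the RAID-1 pairing rule above (illustrative):
 * with num_cols == 4 the mirror pairs are (0,1) and (2,3).  Missing
 * components 0 and 2 leaves one copy of each pair and is survivable,
 * but missing both 0 and 1 loses an entire pair, so the even/odd
 * bookkeeping above returns 0 for that set.
 */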
3300
3301 void
3302 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3303 RF_Raid_t *raidPtr)
3304 {
3305 RF_ComponentLabel_t *clabel;
3306 int i;
3307
3308 clabel = ac->clabel;
3309
3310 /* 1. Fill in the common stuff */
3311 config->numRow = clabel->num_rows = 1;
3312 config->numCol = clabel->num_columns;
3313 config->numSpare = 0; /* XXX should this be set here? */
3314 config->sectPerSU = clabel->sectPerSU;
3315 config->SUsPerPU = clabel->SUsPerPU;
3316 config->SUsPerRU = clabel->SUsPerRU;
3317 config->parityConfig = clabel->parityConfig;
3318 /* XXX... */
3319 strcpy(config->diskQueueType,"fifo");
3320 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3321 config->layoutSpecificSize = 0; /* XXX ?? */
3322
3323 while (ac != NULL) {
3324 /* row/col values will be in range due to the checks
3325 in rf_reasonable_label() */
3326 strcpy(config->devnames[0][ac->clabel->column],
3327 ac->devname);
3328 ac = ac->next;
3329 }
3330
3331 for (i = 0; i < RF_MAXDBGV; i++) {
3332 config->debugVars[i][0] = 0;
3333 }
3334 }
3335
3336 int
3337 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3338 {
3339 RF_ComponentLabel_t *clabel;
3340 int column;
3341 int sparecol;
3342
3343 raidPtr->autoconfigure = new_value;
3344
3345 for (column = 0; column < raidPtr->numCol; column++) {
3346 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3347 clabel = raidget_component_label(raidPtr, column);
3348 clabel->autoconfigure = new_value;
3349 raidflush_component_label(raidPtr, column);
3350 }
3351 }
3352 for (column = 0; column < raidPtr->numSpare; column++) {
3353 sparecol = raidPtr->numCol + column;
3354 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3355 clabel = raidget_component_label(raidPtr, sparecol);
3356 clabel->autoconfigure = new_value;
3357 raidflush_component_label(raidPtr, sparecol);
3358 }
3359 }
3360 return(new_value);
3361 }
3362
3363 int
3364 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3365 {
3366 RF_ComponentLabel_t *clabel;
3367 int column;
3368 int sparecol;
3369
3370 raidPtr->root_partition = new_value;
3371 for (column = 0; column < raidPtr->numCol; column++) {
3372 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3373 clabel = raidget_component_label(raidPtr, column);
3374 clabel->root_partition = new_value;
3375 raidflush_component_label(raidPtr, column);
3376 }
3377 }
3378 for (column = 0; column < raidPtr->numSpare; column++) {
3379 sparecol = raidPtr->numCol + column;
3380 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3381 clabel = raidget_component_label(raidPtr, sparecol);
3382 clabel->root_partition = new_value;
3383 raidflush_component_label(raidPtr, sparecol);
3384 }
3385 }
3386 return(new_value);
3387 }
3388
3389 void
3390 rf_release_all_vps(RF_ConfigSet_t *cset)
3391 {
3392 RF_AutoConfig_t *ac;
3393
3394 ac = cset->ac;
3395 while (ac != NULL) {
3396 /* Close the vp, and give it back */
3397 if (ac->vp) {
3398 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3399 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3400 vput(ac->vp);
3401 ac->vp = NULL;
3402 }
3403 ac = ac->next;
3404 }
3405 }
3406
3407
3408 void
3409 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3410 {
3411 RF_AutoConfig_t *ac;
3412 RF_AutoConfig_t *next_ac;
3413
3414 ac = cset->ac;
3415 while (ac != NULL) {
3416 next_ac = ac->next;
3417 /* nuke the label */
3418 free(ac->clabel, M_RAIDFRAME);
3419 /* cleanup the config structure */
3420 free(ac, M_RAIDFRAME);
3421 /* "next.." */
3422 ac = next_ac;
3423 }
3424 /* and, finally, nuke the config set */
3425 free(cset, M_RAIDFRAME);
3426 }
3427
3428
3429 void
3430 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3431 {
3432 /* current version number */
3433 clabel->version = RF_COMPONENT_LABEL_VERSION;
3434 clabel->serial_number = raidPtr->serial_number;
3435 clabel->mod_counter = raidPtr->mod_counter;
3436
3437 clabel->num_rows = 1;
3438 clabel->num_columns = raidPtr->numCol;
3439 clabel->clean = RF_RAID_DIRTY; /* not clean */
3440 clabel->status = rf_ds_optimal; /* "It's good!" */
3441
3442 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3443 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3444 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3445
3446 clabel->blockSize = raidPtr->bytesPerSector;
3447 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3448
3449 /* XXX not portable */
3450 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3451 clabel->maxOutstanding = raidPtr->maxOutstanding;
3452 clabel->autoconfigure = raidPtr->autoconfigure;
3453 clabel->root_partition = raidPtr->root_partition;
3454 clabel->last_unit = raidPtr->raidid;
3455 clabel->config_order = raidPtr->config_order;
3456
3457 #ifndef RF_NO_PARITY_MAP
3458 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3459 #endif
3460 }
3461
3462 struct raid_softc *
3463 rf_auto_config_set(RF_ConfigSet_t *cset)
3464 {
3465 RF_Raid_t *raidPtr;
3466 RF_Config_t *config;
3467 int raidID;
3468 struct raid_softc *sc;
3469
3470 #ifdef DEBUG
3471 printf("RAID autoconfigure\n");
3472 #endif
3473
3474 /* 1. Create a config structure */
3475 config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
3476 if (config == NULL) {
3477 printf("%s: Out of mem - config!?!?\n", __func__);
3478 /* XXX do something more intelligent here. */
3479 return NULL;
3480 }
3481
3482 /*
3483 2. Figure out what RAID ID this one is supposed to live at
3484 See if we can get the same RAID dev that it was configured
3485 on last time..
3486 */
3487
3488 raidID = cset->ac->clabel->last_unit;
3489 for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
3490 sc = raidget(++raidID, false))
3491 continue;
3492 #ifdef DEBUG
3493 printf("Configuring raid%d:\n",raidID);
3494 #endif
3495
3496 if (sc == NULL)
3497 sc = raidget(raidID, true);
3498 if (sc == NULL) {
3499 printf("%s: Out of mem - softc!?!?\n", __func__);
3500 /* XXX do something more intelligent here. */
3501 free(config, M_RAIDFRAME);
3502 return NULL;
3503 }
3504
3505 raidPtr = &sc->sc_r;
3506
3507 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3508 raidPtr->softc = sc;
3509 raidPtr->raidid = raidID;
3510 raidPtr->openings = RAIDOUTSTANDING;
3511
3512 /* 3. Build the configuration structure */
3513 rf_create_configuration(cset->ac, config, raidPtr);
3514
3515 /* 4. Do the configuration */
3516 if (rf_Configure(raidPtr, config, cset->ac) == 0) {
3517 raidinit(sc);
3518
3519 rf_markalldirty(raidPtr);
3520 raidPtr->autoconfigure = 1; /* XXX do this here? */
3521 switch (cset->ac->clabel->root_partition) {
3522 case 1: /* Force Root */
3523 case 2: /* Soft Root: root when boot partition part of raid */
3524 /*
3525 * everything configured just fine. Make a note
3526 * that this set is eligible to be root,
3527 * or forced to be root
3528 */
3529 cset->rootable = cset->ac->clabel->root_partition;
3530 /* XXX do this here? */
3531 raidPtr->root_partition = cset->rootable;
3532 break;
3533 default:
3534 break;
3535 }
3536 } else {
3537 raidput(sc);
3538 sc = NULL;
3539 }
3540
3541 /* 5. Cleanup */
3542 free(config, M_RAIDFRAME);
3543 return sc;
3544 }
3545
3546 void
3547 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
3548 size_t xmin, size_t xmax)
3549 {
3550 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3551 pool_sethiwat(p, xmax);
3552 pool_prime(p, xmin);
3553 pool_setlowat(p, xmin);
3554 }
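
/*
 * Hypothetical usage of rf_pool_init() (sketch only; the pool member
 * and sizes below are made up for illustration):
 *
 *	rf_pool_init(&rf_pools.example, sizeof(struct example_ent),
 *	    "rf_examplepl", 32, 128);
 *
 * This primes the pool with 32 items, sets the low-water mark to 32
 * and the high-water mark to 128.
 */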
3555
3556 /*
3557 * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buffer queue
3558 * to see if there is IO pending and if that IO could possibly be done
3559 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3560 * otherwise.
3561 *
3562 */
3563 int
3564 rf_buf_queue_check(RF_Raid_t *raidPtr)
3565 {
3566 struct raid_softc *rs;
3567 struct dk_softc *dksc;
3568
3569 rs = raidPtr->softc;
3570 dksc = &rs->sc_dksc;
3571
3572 if ((rs->sc_flags & RAIDF_INITED) == 0)
3573 return 1;
3574
3575 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3576 /* there is work to do */
3577 return 0;
3578 }
3579 /* default is nothing to do */
3580 return 1;
3581 }
3582
3583 int
3584 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3585 {
3586 uint64_t numsecs;
3587 unsigned secsize;
3588 int error;
3589
3590 error = getdisksize(vp, &numsecs, &secsize);
3591 if (error == 0) {
3592 diskPtr->blockSize = secsize;
3593 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3594 diskPtr->partitionSize = numsecs;
3595 return 0;
3596 }
3597 return error;
3598 }
3599
3600 static int
3601 raid_match(device_t self, cfdata_t cfdata, void *aux)
3602 {
3603 return 1;
3604 }
3605
3606 static void
3607 raid_attach(device_t parent, device_t self, void *aux)
3608 {
3609 }
3610
3611
3612 static int
3613 raid_detach(device_t self, int flags)
3614 {
3615 int error;
3616 struct raid_softc *rs = raidsoftc(self);
3617
3618 if (rs == NULL)
3619 return ENXIO;
3620
3621 if ((error = raidlock(rs)) != 0)
3622 return (error);
3623
3624 error = raid_detach_unlocked(rs);
3625
3626 raidunlock(rs);
3627
3628 /* XXX raid can be referenced here */
3629
3630 if (error)
3631 return error;
3632
3633 /* Free the softc */
3634 raidput(rs);
3635
3636 return 0;
3637 }
3638
3639 static void
3640 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3641 {
3642 struct dk_softc *dksc = &rs->sc_dksc;
3643 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3644
3645 memset(dg, 0, sizeof(*dg));
3646
3647 dg->dg_secperunit = raidPtr->totalSectors;
3648 dg->dg_secsize = raidPtr->bytesPerSector;
3649 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3650 dg->dg_ntracks = 4 * raidPtr->numCol;
3651
3652 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3653 }
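
/*
 * The geometry published above is synthetic.  For example (assuming a
 * five-column RAID 5 set with 32 sectors per stripe unit, i.e. four
 * data columns): dataSectorsPerStripe == 4 * 32 == 128, so
 * dg_nsectors == 128 and dg_ntracks == 4 * 5 == 20; the remaining
 * fields are presumably derived by disk_set_info() from
 * dg_secperunit.
 */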
3654
3655 /*
3656 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3657 * We end up returning whatever error was returned by the first cache flush
3658 * that fails.
3659 */
3660
3661 int
3662 rf_sync_component_caches(RF_Raid_t *raidPtr)
3663 {
3664 int c, sparecol;
3665 int e, error;
3666 int force = 1;
3667
3668 error = 0;
3669 for (c = 0; c < raidPtr->numCol; c++) {
3670 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3671 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3672 &force, FWRITE, NOCRED);
3673 if (e) {
3674 if (e != ENODEV)
3675 printf("raid%d: cache flush to component %s failed.\n",
3676 raidPtr->raidid, raidPtr->Disks[c].devname);
3677 if (error == 0) {
3678 error = e;
3679 }
3680 }
3681 }
3682 }
3683
3684 for (c = 0; c < raidPtr->numSpare; c++) {
3685 sparecol = raidPtr->numCol + c;
3686 /* Need to ensure that the reconstruct actually completed! */
3687 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3688 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3689 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3690 if (e) {
3691 if (e != ENODEV)
3692 printf("raid%d: cache flush to component %s failed.\n",
3693 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3694 if (error == 0) {
3695 error = e;
3696 }
3697 }
3698 }
3699 }
3700 return error;
3701 }
3702
3703 /*
3704 * Module interface
3705 */
3706
3707 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
3708
3709 #ifdef _MODULE
3710 CFDRIVER_DECL(raid, DV_DISK, NULL);
3711 #endif
3712
3713 static int raid_modcmd(modcmd_t, void *);
3714 static int raid_modcmd_init(void);
3715 static int raid_modcmd_fini(void);
3716
3717 static int
3718 raid_modcmd(modcmd_t cmd, void *data)
3719 {
3720 int error;
3721
3722 error = 0;
3723 switch (cmd) {
3724 case MODULE_CMD_INIT:
3725 error = raid_modcmd_init();
3726 break;
3727 case MODULE_CMD_FINI:
3728 error = raid_modcmd_fini();
3729 break;
3730 default:
3731 error = ENOTTY;
3732 break;
3733 }
3734 return error;
3735 }
3736
3737 static int
3738 raid_modcmd_init(void)
3739 {
3740 int error;
3741 int bmajor, cmajor;
3742
3743 mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
3744 mutex_enter(&raid_lock);
3745 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3746 rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
3747 rf_init_cond2(rf_sparet_wait_cv, "sparetw");
3748 rf_init_cond2(rf_sparet_resp_cv, "rfgst");
3749
3750 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
3751 #endif
3752
3753 bmajor = cmajor = -1;
3754 error = devsw_attach("raid", &raid_bdevsw, &bmajor,
3755 &raid_cdevsw, &cmajor);
3756 if (error != 0 && error != EEXIST) {
3757 aprint_error("%s: devsw_attach failed %d\n", __func__, error);
3758 mutex_exit(&raid_lock);
3759 return error;
3760 }
3761 #ifdef _MODULE
3762 error = config_cfdriver_attach(&raid_cd);
3763 if (error != 0) {
3764 aprint_error("%s: config_cfdriver_attach failed %d\n",
3765 __func__, error);
3766 devsw_detach(&raid_bdevsw, &raid_cdevsw);
3767 mutex_exit(&raid_lock);
3768 return error;
3769 }
3770 #endif
3771 error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3772 if (error != 0) {
3773 aprint_error("%s: config_cfattach_attach failed %d\n",
3774 __func__, error);
3775 #ifdef _MODULE
3776 config_cfdriver_detach(&raid_cd);
3777 #endif
3778 devsw_detach(&raid_bdevsw, &raid_cdevsw);
3779 mutex_exit(&raid_lock);
3780 return error;
3781 }
3782
3783 raidautoconfigdone = false;
3784
3785 mutex_exit(&raid_lock);
3786
3787 if (error == 0) {
3788 if (rf_BootRaidframe(true) == 0)
3789 aprint_verbose("Kernelized RAIDframe activated\n");
3790 else
3791 panic("Serious error activating RAID!!");
3792 }
3793
3794 /*
3795 * Register a finalizer which will be used to auto-config RAID
3796 * sets once all real hardware devices have been found.
3797 */
3798 error = config_finalize_register(NULL, rf_autoconfig);
3799 if (error != 0) {
3800 aprint_error("WARNING: unable to register RAIDframe "
3801 "finalizer\n");
3802 error = 0;
3803 }
3804
3805 return error;
3806 }
3807
3808 static int
3809 raid_modcmd_fini(void)
3810 {
3811 int error;
3812
3813 mutex_enter(&raid_lock);
3814
3815 /* Don't allow unload if raid device(s) exist. */
3816 if (!LIST_EMPTY(&raids)) {
3817 mutex_exit(&raid_lock);
3818 return EBUSY;
3819 }
3820
3821 error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
3822 if (error != 0) {
3823 aprint_error("%s: cannot detach cfattach\n",__func__);
3824 mutex_exit(&raid_lock);
3825 return error;
3826 }
3827 #ifdef _MODULE
3828 error = config_cfdriver_detach(&raid_cd);
3829 if (error != 0) {
3830 aprint_error("%s: cannot detach cfdriver\n",__func__);
3831 config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3832 mutex_exit(&raid_lock);
3833 return error;
3834 }
3835 #endif
3836 error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
3837 if (error != 0) {
3838 aprint_error("%s: cannot detach devsw\n",__func__);
3839 #ifdef _MODULE
3840 config_cfdriver_attach(&raid_cd);
3841 #endif
3842 config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3843 mutex_exit(&raid_lock);
3844 return error;
3845 }
3846 rf_BootRaidframe(false);
3847 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3848 rf_destroy_mutex2(rf_sparet_wait_mutex);
3849 rf_destroy_cond2(rf_sparet_wait_cv);
3850 rf_destroy_cond2(rf_sparet_resp_cv);
3851 #endif
3852 mutex_exit(&raid_lock);
3853 mutex_destroy(&raid_lock);
3854
3855 return error;
3856 }
3857