rf_netbsdkintf.c revision 1.302 1 /* $NetBSD: rf_netbsdkintf.c,v 1.302 2013/04/29 21:21:10 christos Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.302 2013/04/29 21:21:10 christos Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129
130 #include <prop/proplib.h>
131
132 #include <dev/raidframe/raidframevar.h>
133 #include <dev/raidframe/raidframeio.h>
134 #include <dev/raidframe/rf_paritymap.h>
135
136 #include "rf_raid.h"
137 #include "rf_copyback.h"
138 #include "rf_dag.h"
139 #include "rf_dagflags.h"
140 #include "rf_desc.h"
141 #include "rf_diskqueue.h"
142 #include "rf_etimer.h"
143 #include "rf_general.h"
144 #include "rf_kintf.h"
145 #include "rf_options.h"
146 #include "rf_driver.h"
147 #include "rf_parityscan.h"
148 #include "rf_threadstuff.h"
149
150 #ifdef COMPAT_50
151 #include "rf_compat50.h"
152 #endif
153
154 #ifdef DEBUG
155 int rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else /* DEBUG */
158 #define db1_printf(a) { }
159 #endif /* DEBUG */
160
161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
162 static rf_declare_mutex2(rf_sparet_wait_mutex);
163 static rf_declare_cond2(rf_sparet_wait_cv);
164 static rf_declare_cond2(rf_sparet_resp_cv);
165
166 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
167 * spare table */
168 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
169 * installation process */
170 #endif
171
172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
173
174 /* prototypes */
175 static void KernelWakeupFunc(struct buf *);
176 static void InitBP(struct buf *, struct vnode *, unsigned,
177 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
178 void *, int, struct proc *);
179 struct raid_softc;
180 static void raidinit(struct raid_softc *);
181
182 void raidattach(int);
183 static int raid_match(device_t, cfdata_t, void *);
184 static void raid_attach(device_t, device_t, void *);
185 static int raid_detach(device_t, int);
186
187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
188 daddr_t, daddr_t);
189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
190 daddr_t, daddr_t, int);
191
192 static int raidwrite_component_label(unsigned,
193 dev_t, struct vnode *, RF_ComponentLabel_t *);
194 static int raidread_component_label(unsigned,
195 dev_t, struct vnode *, RF_ComponentLabel_t *);
196
197
198 dev_type_open(raidopen);
199 dev_type_close(raidclose);
200 dev_type_read(raidread);
201 dev_type_write(raidwrite);
202 dev_type_ioctl(raidioctl);
203 dev_type_strategy(raidstrategy);
204 dev_type_dump(raiddump);
205 dev_type_size(raidsize);
206
207 const struct bdevsw raid_bdevsw = {
208 raidopen, raidclose, raidstrategy, raidioctl,
209 raiddump, raidsize, D_DISK
210 };
211
212 const struct cdevsw raid_cdevsw = {
213 raidopen, raidclose, raidread, raidwrite, raidioctl,
214 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
215 };
216
217 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
218
/*
 * Per-unit software state for a RAID pseudo-device.  One of these is
 * allocated per unit (see raidcreate()/raidget()) and linked into the
 * global `raids' list, protected by raid_lock.
 */
struct raid_softc {
	device_t sc_dev;	/* autoconf device handle */
	int     sc_unit;	/* raid unit number */
	int     sc_flags;	/* flags (RAIDF_*, below) */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;		/* the RAIDframe descriptor itself */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the global `raids' list */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */

/* extract the unit number from a raid device number */
#define	raidunit(x)	DISKUNIT(x)
240
241 extern struct cfdriver raid_cd;
242 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
243 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
244 DVF_DETACH_SHUTDOWN);
245
246 /*
247 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
248 * Be aware that large numbers can allow the driver to consume a lot of
249 * kernel memory, especially on writes, and in degraded mode reads.
250 *
251 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
252 * a single 64K write will typically require 64K for the old data,
253 * 64K for the old parity, and 64K for the new parity, for a total
254 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
256 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
257 *
258 * Now in degraded mode, for example, a 64K read on the above setup may
259 * require data reconstruction, which will require *all* of the 4 remaining
260 * disks to participate -- 4 * 32K/disk == 128K again.
261 */
262
263 #ifndef RAIDOUTSTANDING
264 #define RAIDOUTSTANDING 6
265 #endif
266
267 #define RAIDLABELDEV(dev) \
268 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
269
270 /* declared here, and made public, for the benefit of KVM stuff.. */
271
272 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
273 struct disklabel *);
274 static void raidgetdisklabel(dev_t);
275 static void raidmakedisklabel(struct raid_softc *);
276
277 static int raidlock(struct raid_softc *);
278 static void raidunlock(struct raid_softc *);
279
280 static int raid_detach_unlocked(struct raid_softc *);
281
282 static void rf_markalldirty(RF_Raid_t *);
283 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
284
285 void rf_ReconThread(struct rf_recon_req *);
286 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
287 void rf_CopybackThread(RF_Raid_t *raidPtr);
288 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
289 int rf_autoconfig(device_t);
290 void rf_buildroothack(RF_ConfigSet_t *);
291
292 RF_AutoConfig_t *rf_find_raid_components(void);
293 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
294 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
295 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
296 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
297 int rf_set_autoconfig(RF_Raid_t *, int);
298 int rf_set_rootpartition(RF_Raid_t *, int);
299 void rf_release_all_vps(RF_ConfigSet_t *);
300 void rf_cleanup_config_set(RF_ConfigSet_t *);
301 int rf_have_enough_components(RF_ConfigSet_t *);
302 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
303 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
304
305 /*
306 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
307 * Note that this is overridden by having RAID_AUTOCONFIG as an option
308 * in the kernel config file.
309 */
310 #ifdef RAID_AUTOCONFIG
311 int raidautoconfig = 1;
312 #else
313 int raidautoconfig = 0;
314 #endif
315 static bool raidautoconfigdone = false;
316
317 struct RF_Pools_s rf_pools;
318
319 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
320 static kmutex_t raid_lock;
321
322 static struct raid_softc *
323 raidcreate(int unit) {
324 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
325 if (sc == NULL) {
326 #ifdef DIAGNOSTIC
327 printf("%s: out of memory\n", __func__);
328 #endif
329 return NULL;
330 }
331 sc->sc_unit = unit;
332 bufq_alloc(&sc->buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
333 return sc;
334 }
335
/*
 * raiddestroy: release the resources held by a softc created with
 * raidcreate().  The softc must not be on the `raids' list any more
 * (raidput() unlinks it first).
 */
static void
raiddestroy(struct raid_softc *sc) {
	/* free the buffer queue before the softc that references it */
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
341
342 static struct raid_softc *
343 raidget(int unit) {
344 struct raid_softc *sc;
345 if (unit < 0) {
346 #ifdef DIAGNOSTIC
347 panic("%s: unit %d!", __func__, unit);
348 #endif
349 return NULL;
350 }
351 mutex_enter(&raid_lock);
352 LIST_FOREACH(sc, &raids, sc_link) {
353 if (sc->sc_unit == unit) {
354 mutex_exit(&raid_lock);
355 return sc;
356 }
357 }
358 mutex_exit(&raid_lock);
359 if ((sc = raidcreate(unit)) == NULL)
360 return NULL;
361 mutex_enter(&raid_lock);
362 LIST_INSERT_HEAD(&raids, sc, sc_link);
363 mutex_exit(&raid_lock);
364 return sc;
365 }
366
/*
 * raidput: unlink a softc from the global `raids' list (under
 * raid_lock) and free it.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
374
/*
 * raidattach: pseudo-device attach routine, run once at boot.
 * Initializes global driver state, boots the RAIDframe core, attaches
 * the autoconf glue, and registers a finalizer so that RAID sets are
 * auto-configured after all real hardware has been found.
 * `num' (the requested number of units) is unused; units are created
 * on demand by raidget().
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* spare-table installation handshake state */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	/* `raid_ca' is declared by the CFATTACH_DECL3_NEW(raid, ...) above */
	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
407
408 int
409 rf_autoconfig(device_t self)
410 {
411 RF_AutoConfig_t *ac_list;
412 RF_ConfigSet_t *config_sets;
413
414 if (!raidautoconfig || raidautoconfigdone == true)
415 return (0);
416
417 /* XXX This code can only be run once. */
418 raidautoconfigdone = true;
419
420 /* 1. locate all RAID components on the system */
421 aprint_debug("Searching for RAID components...\n");
422 ac_list = rf_find_raid_components();
423
424 /* 2. Sort them into their respective sets. */
425 config_sets = rf_create_auto_sets(ac_list);
426
427 /*
428 * 3. Evaluate each set and configure the valid ones.
429 * This gets done in rf_buildroothack().
430 */
431 rf_buildroothack(config_sets);
432
433 return 1;
434 }
435
/*
 * rf_buildroothack: walk the candidate configuration sets, configure
 * the complete, autoconfigure-enabled ones, and — unless the user
 * hardwired a root device — try to point booted_device at a RAID set
 * marked root-eligible so setroot() can boot from it.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int col;
	int num_root;
	char *devname;
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/* Configure each usable set; release resources of the rest. */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				/* remember the last rootable set we saw */
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			/* XXX: How do we find the real root partition? */
			char cname[sizeof(cset->ac->devname)];
			/* hardcodes the 'a' partition as the wedge name */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			booted_device = dkwedge_find_by_wname(cname);
		} else
			booted_device = rsc->sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/*
		 * Several sets claimed rootability; disambiguate by
		 * finding the set that contains the boot device.
		 */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			for (col = 0; col < r->numCol; col++) {
				devname = r->Disks[col].devname;
				/* skip the "/dev/" prefix before comparing */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
				    strlen(device_xname(booted_device))) != 0)
					continue;
				aprint_debug("raid%d includes boot device %s\n",
				    sc->sc_unit, devname);
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
539
540
541 int
542 raidsize(dev_t dev)
543 {
544 struct raid_softc *rs;
545 struct disklabel *lp;
546 int part, unit, omask, size;
547
548 unit = raidunit(dev);
549 if ((rs = raidget(unit)) == NULL)
550 return -1;
551 if ((rs->sc_flags & RAIDF_INITED) == 0)
552 return (-1);
553
554 part = DISKPART(dev);
555 omask = rs->sc_dkdev.dk_openmask & (1 << part);
556 lp = rs->sc_dkdev.dk_label;
557
558 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
559 return (-1);
560
561 if (lp->d_partitions[part].p_fstype != FS_SWAP)
562 size = -1;
563 else
564 size = lp->d_partitions[part].p_size *
565 (lp->d_secsize / DEV_BSIZE);
566
567 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
568 return (-1);
569
570 return (size);
571
572 }
573
574 int
575 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
576 {
577 int unit = raidunit(dev);
578 struct raid_softc *rs;
579 const struct bdevsw *bdev;
580 struct disklabel *lp;
581 RF_Raid_t *raidPtr;
582 daddr_t offset;
583 int part, c, sparecol, j, scol, dumpto;
584 int error = 0;
585
586 if ((rs = raidget(unit)) == NULL)
587 return ENXIO;
588
589 raidPtr = &rs->sc_r;
590
591 if ((rs->sc_flags & RAIDF_INITED) == 0)
592 return ENXIO;
593
594 /* we only support dumping to RAID 1 sets */
595 if (raidPtr->Layout.numDataCol != 1 ||
596 raidPtr->Layout.numParityCol != 1)
597 return EINVAL;
598
599
600 if ((error = raidlock(rs)) != 0)
601 return error;
602
603 if (size % DEV_BSIZE != 0) {
604 error = EINVAL;
605 goto out;
606 }
607
608 if (blkno + size / DEV_BSIZE > rs->sc_size) {
609 printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
610 "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
611 size / DEV_BSIZE, rs->sc_size);
612 error = EINVAL;
613 goto out;
614 }
615
616 part = DISKPART(dev);
617 lp = rs->sc_dkdev.dk_label;
618 offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
619
620 /* figure out what device is alive.. */
621
622 /*
623 Look for a component to dump to. The preference for the
624 component to dump to is as follows:
625 1) the master
626 2) a used_spare of the master
627 3) the slave
628 4) a used_spare of the slave
629 */
630
631 dumpto = -1;
632 for (c = 0; c < raidPtr->numCol; c++) {
633 if (raidPtr->Disks[c].status == rf_ds_optimal) {
634 /* this might be the one */
635 dumpto = c;
636 break;
637 }
638 }
639
640 /*
641 At this point we have possibly selected a live master or a
642 live slave. We now check to see if there is a spared
643 master (or a spared slave), if we didn't find a live master
644 or a live slave.
645 */
646
647 for (c = 0; c < raidPtr->numSpare; c++) {
648 sparecol = raidPtr->numCol + c;
649 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
650 /* How about this one? */
651 scol = -1;
652 for(j=0;j<raidPtr->numCol;j++) {
653 if (raidPtr->Disks[j].spareCol == sparecol) {
654 scol = j;
655 break;
656 }
657 }
658 if (scol == 0) {
659 /*
660 We must have found a spared master!
661 We'll take that over anything else
662 found so far. (We couldn't have
663 found a real master before, since
664 this is a used spare, and it's
665 saying that it's replacing the
666 master.) On reboot (with
667 autoconfiguration turned on)
668 sparecol will become the 1st
669 component (component0) of this set.
670 */
671 dumpto = sparecol;
672 break;
673 } else if (scol != -1) {
674 /*
675 Must be a spared slave. We'll dump
676 to that if we havn't found anything
677 else so far.
678 */
679 if (dumpto == -1)
680 dumpto = sparecol;
681 }
682 }
683 }
684
685 if (dumpto == -1) {
686 /* we couldn't find any live components to dump to!?!?
687 */
688 error = EINVAL;
689 goto out;
690 }
691
692 bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
693
694 /*
695 Note that blkno is relative to this particular partition.
696 By adding the offset of this partition in the RAID
697 set, and also adding RF_PROTECTED_SECTORS, we get a
698 value that is relative to the partition used for the
699 underlying component.
700 */
701
702 error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
703 blkno + offset, va, size);
704
705 out:
706 raidunlock(rs);
707
708 return error;
709 }
710 /* ARGSUSED */
711 int
712 raidopen(dev_t dev, int flags, int fmt,
713 struct lwp *l)
714 {
715 int unit = raidunit(dev);
716 struct raid_softc *rs;
717 struct disklabel *lp;
718 int part, pmask;
719 int error = 0;
720
721 if ((rs = raidget(unit)) == NULL)
722 return ENXIO;
723 if ((error = raidlock(rs)) != 0)
724 return (error);
725
726 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
727 error = EBUSY;
728 goto bad;
729 }
730
731 lp = rs->sc_dkdev.dk_label;
732
733 part = DISKPART(dev);
734
735 /*
736 * If there are wedges, and this is not RAW_PART, then we
737 * need to fail.
738 */
739 if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
740 error = EBUSY;
741 goto bad;
742 }
743 pmask = (1 << part);
744
745 if ((rs->sc_flags & RAIDF_INITED) &&
746 (rs->sc_dkdev.dk_openmask == 0))
747 raidgetdisklabel(dev);
748
749 /* make sure that this partition exists */
750
751 if (part != RAW_PART) {
752 if (((rs->sc_flags & RAIDF_INITED) == 0) ||
753 ((part >= lp->d_npartitions) ||
754 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
755 error = ENXIO;
756 goto bad;
757 }
758 }
759 /* Prevent this unit from being unconfigured while open. */
760 switch (fmt) {
761 case S_IFCHR:
762 rs->sc_dkdev.dk_copenmask |= pmask;
763 break;
764
765 case S_IFBLK:
766 rs->sc_dkdev.dk_bopenmask |= pmask;
767 break;
768 }
769
770 if ((rs->sc_dkdev.dk_openmask == 0) &&
771 ((rs->sc_flags & RAIDF_INITED) != 0)) {
772 /* First one... mark things as dirty... Note that we *MUST*
773 have done a configure before this. I DO NOT WANT TO BE
774 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
775 THAT THEY BELONG TOGETHER!!!!! */
776 /* XXX should check to see if we're only open for reading
777 here... If so, we needn't do this, but then need some
778 other way of keeping track of what's happened.. */
779
780 rf_markalldirty(&rs->sc_r);
781 }
782
783
784 rs->sc_dkdev.dk_openmask =
785 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
786
787 bad:
788 raidunlock(rs);
789
790 return (error);
791
792
793 }
794 /* ARGSUSED */
795 int
796 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
797 {
798 int unit = raidunit(dev);
799 struct raid_softc *rs;
800 int error = 0;
801 int part;
802
803 if ((rs = raidget(unit)) == NULL)
804 return ENXIO;
805
806 if ((error = raidlock(rs)) != 0)
807 return (error);
808
809 part = DISKPART(dev);
810
811 /* ...that much closer to allowing unconfiguration... */
812 switch (fmt) {
813 case S_IFCHR:
814 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
815 break;
816
817 case S_IFBLK:
818 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
819 break;
820 }
821 rs->sc_dkdev.dk_openmask =
822 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
823
824 if ((rs->sc_dkdev.dk_openmask == 0) &&
825 ((rs->sc_flags & RAIDF_INITED) != 0)) {
826 /* Last one... device is not unconfigured yet.
827 Device shutdown has taken care of setting the
828 clean bits if RAIDF_INITED is not set
829 mark things as clean... */
830
831 rf_update_component_labels(&rs->sc_r,
832 RF_FINAL_COMPONENT_UPDATE);
833
834 /* If the kernel is shutting down, it will detach
835 * this RAID set soon enough.
836 */
837 }
838
839 raidunlock(rs);
840 return (0);
841
842 }
843
/*
 * raidstrategy: queue a buffer for I/O on a RAID unit.  After validity
 * and bounds checks the buf is placed on the unit's buffer queue and
 * the per-raid I/O handler is woken via iodone_cv; on any error the
 * buf is completed immediately with biodone().
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int wlabel;
	struct raid_softc *rs;

	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	/* zero-length transfers complete trivially (b_error stays 0) */
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size;	/* device size in DEV_BSIZE unit */

		/* convert totalSectors (native sectors) to DEV_BSIZE units */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* scheduled the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
914 /* ARGSUSED */
915 int
916 raidread(dev_t dev, struct uio *uio, int flags)
917 {
918 int unit = raidunit(dev);
919 struct raid_softc *rs;
920
921 if ((rs = raidget(unit)) == NULL)
922 return ENXIO;
923
924 if ((rs->sc_flags & RAIDF_INITED) == 0)
925 return (ENXIO);
926
927 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
928
929 }
930 /* ARGSUSED */
931 int
932 raidwrite(dev_t dev, struct uio *uio, int flags)
933 {
934 int unit = raidunit(dev);
935 struct raid_softc *rs;
936
937 if ((rs = raidget(unit)) == NULL)
938 return ENXIO;
939
940 if ((rs->sc_flags & RAIDF_INITED) == 0)
941 return (ENXIO);
942
943 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
944
945 }
946
947 static int
948 raid_detach_unlocked(struct raid_softc *rs)
949 {
950 int error;
951 RF_Raid_t *raidPtr;
952
953 raidPtr = &rs->sc_r;
954
955 /*
956 * If somebody has a partition mounted, we shouldn't
957 * shutdown.
958 */
959 if (rs->sc_dkdev.dk_openmask != 0)
960 return EBUSY;
961
962 if ((rs->sc_flags & RAIDF_INITED) == 0)
963 ; /* not initialized: nothing to do */
964 else if ((error = rf_Shutdown(raidPtr)) != 0)
965 return error;
966 else
967 rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
968
969 /* Detach the disk. */
970 dkwedge_delall(&rs->sc_dkdev);
971 disk_detach(&rs->sc_dkdev);
972 disk_destroy(&rs->sc_dkdev);
973
974 aprint_normal_dev(rs->sc_dev, "detached\n");
975
976 return 0;
977 }
978
979 int
980 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
981 {
982 int unit = raidunit(dev);
983 int error = 0;
984 int part, pmask, s;
985 cfdata_t cf;
986 struct raid_softc *rs;
987 RF_Config_t *k_cfg, *u_cfg;
988 RF_Raid_t *raidPtr;
989 RF_RaidDisk_t *diskPtr;
990 RF_AccTotals_t *totals;
991 RF_DeviceConfig_t *d_cfg, **ucfgp;
992 u_char *specific_buf;
993 int retcode = 0;
994 int column;
995 /* int raidid; */
996 struct rf_recon_req *rrcopy, *rr;
997 RF_ComponentLabel_t *clabel;
998 RF_ComponentLabel_t *ci_label;
999 RF_ComponentLabel_t **clabel_ptr;
1000 RF_SingleComponent_t *sparePtr,*componentPtr;
1001 RF_SingleComponent_t component;
1002 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1003 int i, j, d;
1004 #ifdef __HAVE_OLD_DISKLABEL
1005 struct disklabel newlabel;
1006 #endif
1007 struct dkwedge_info *dkw;
1008
1009 if ((rs = raidget(unit)) == NULL)
1010 return ENXIO;
1011 raidPtr = &rs->sc_r;
1012
1013 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1014 (int) DISKPART(dev), (int) unit, cmd));
1015
1016 /* Must be open for writes for these commands... */
1017 switch (cmd) {
1018 #ifdef DIOCGSECTORSIZE
1019 case DIOCGSECTORSIZE:
1020 *(u_int *)data = raidPtr->bytesPerSector;
1021 return 0;
1022 case DIOCGMEDIASIZE:
1023 *(off_t *)data =
1024 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1025 return 0;
1026 #endif
1027 case DIOCSDINFO:
1028 case DIOCWDINFO:
1029 #ifdef __HAVE_OLD_DISKLABEL
1030 case ODIOCWDINFO:
1031 case ODIOCSDINFO:
1032 #endif
1033 case DIOCWLABEL:
1034 case DIOCAWEDGE:
1035 case DIOCDWEDGE:
1036 case DIOCSSTRATEGY:
1037 if ((flag & FWRITE) == 0)
1038 return (EBADF);
1039 }
1040
1041 /* Must be initialized for these... */
1042 switch (cmd) {
1043 case DIOCGDINFO:
1044 case DIOCSDINFO:
1045 case DIOCWDINFO:
1046 #ifdef __HAVE_OLD_DISKLABEL
1047 case ODIOCGDINFO:
1048 case ODIOCWDINFO:
1049 case ODIOCSDINFO:
1050 case ODIOCGDEFLABEL:
1051 #endif
1052 case DIOCGPART:
1053 case DIOCWLABEL:
1054 case DIOCGDEFLABEL:
1055 case DIOCAWEDGE:
1056 case DIOCDWEDGE:
1057 case DIOCLWEDGES:
1058 case DIOCCACHESYNC:
1059 case RAIDFRAME_SHUTDOWN:
1060 case RAIDFRAME_REWRITEPARITY:
1061 case RAIDFRAME_GET_INFO:
1062 case RAIDFRAME_RESET_ACCTOTALS:
1063 case RAIDFRAME_GET_ACCTOTALS:
1064 case RAIDFRAME_KEEP_ACCTOTALS:
1065 case RAIDFRAME_GET_SIZE:
1066 case RAIDFRAME_FAIL_DISK:
1067 case RAIDFRAME_COPYBACK:
1068 case RAIDFRAME_CHECK_RECON_STATUS:
1069 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1070 case RAIDFRAME_GET_COMPONENT_LABEL:
1071 case RAIDFRAME_SET_COMPONENT_LABEL:
1072 case RAIDFRAME_ADD_HOT_SPARE:
1073 case RAIDFRAME_REMOVE_HOT_SPARE:
1074 case RAIDFRAME_INIT_LABELS:
1075 case RAIDFRAME_REBUILD_IN_PLACE:
1076 case RAIDFRAME_CHECK_PARITY:
1077 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1078 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1079 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1080 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1081 case RAIDFRAME_SET_AUTOCONFIG:
1082 case RAIDFRAME_SET_ROOT:
1083 case RAIDFRAME_DELETE_COMPONENT:
1084 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1085 case RAIDFRAME_PARITYMAP_STATUS:
1086 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1087 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1088 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1089 case DIOCGSTRATEGY:
1090 case DIOCSSTRATEGY:
1091 if ((rs->sc_flags & RAIDF_INITED) == 0)
1092 return (ENXIO);
1093 }
1094
1095 switch (cmd) {
1096 #ifdef COMPAT_50
1097 case RAIDFRAME_GET_INFO50:
1098 return rf_get_info50(raidPtr, data);
1099
1100 case RAIDFRAME_CONFIGURE50:
1101 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1102 return retcode;
1103 goto config;
1104 #endif
1105 /* configure the system */
1106 case RAIDFRAME_CONFIGURE:
1107
1108 if (raidPtr->valid) {
1109 /* There is a valid RAID set running on this unit! */
1110 printf("raid%d: Device already configured!\n",unit);
1111 return(EINVAL);
1112 }
1113
1114 /* copy-in the configuration information */
1115 /* data points to a pointer to the configuration structure */
1116
1117 u_cfg = *((RF_Config_t **) data);
1118 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1119 if (k_cfg == NULL) {
1120 return (ENOMEM);
1121 }
1122 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1123 if (retcode) {
1124 RF_Free(k_cfg, sizeof(RF_Config_t));
1125 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1126 retcode));
1127 return (retcode);
1128 }
1129 goto config;
1130 config:
1131 /* allocate a buffer for the layout-specific data, and copy it
1132 * in */
1133 if (k_cfg->layoutSpecificSize) {
1134 if (k_cfg->layoutSpecificSize > 10000) {
1135 /* sanity check */
1136 RF_Free(k_cfg, sizeof(RF_Config_t));
1137 return (EINVAL);
1138 }
1139 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1140 (u_char *));
1141 if (specific_buf == NULL) {
1142 RF_Free(k_cfg, sizeof(RF_Config_t));
1143 return (ENOMEM);
1144 }
1145 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1146 k_cfg->layoutSpecificSize);
1147 if (retcode) {
1148 RF_Free(k_cfg, sizeof(RF_Config_t));
1149 RF_Free(specific_buf,
1150 k_cfg->layoutSpecificSize);
1151 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1152 retcode));
1153 return (retcode);
1154 }
1155 } else
1156 specific_buf = NULL;
1157 k_cfg->layoutSpecific = specific_buf;
1158
1159 /* should do some kind of sanity check on the configuration.
1160 * Store the sum of all the bytes in the last byte? */
1161
1162 /* configure the system */
1163
1164 /*
1165 * Clear the entire RAID descriptor, just to make sure
1166 * there is no stale data left in the case of a
1167 * reconfiguration
1168 */
1169 memset(raidPtr, 0, sizeof(*raidPtr));
1170 raidPtr->softc = rs;
1171 raidPtr->raidid = unit;
1172
1173 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1174
1175 if (retcode == 0) {
1176
1177 /* allow this many simultaneous IO's to
1178 this RAID device */
1179 raidPtr->openings = RAIDOUTSTANDING;
1180
1181 raidinit(rs);
1182 rf_markalldirty(raidPtr);
1183 }
1184 /* free the buffers. No return code here. */
1185 if (k_cfg->layoutSpecificSize) {
1186 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1187 }
1188 RF_Free(k_cfg, sizeof(RF_Config_t));
1189
1190 return (retcode);
1191
1192 /* shutdown the system */
1193 case RAIDFRAME_SHUTDOWN:
1194
1195 part = DISKPART(dev);
1196 pmask = (1 << part);
1197
1198 if ((error = raidlock(rs)) != 0)
1199 return (error);
1200
1201 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1202 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1203 (rs->sc_dkdev.dk_copenmask & pmask)))
1204 retcode = EBUSY;
1205 else {
1206 rs->sc_flags |= RAIDF_SHUTDOWN;
1207 rs->sc_dkdev.dk_copenmask &= ~pmask;
1208 rs->sc_dkdev.dk_bopenmask &= ~pmask;
1209 rs->sc_dkdev.dk_openmask &= ~pmask;
1210 retcode = 0;
1211 }
1212
1213 raidunlock(rs);
1214
1215 if (retcode != 0)
1216 return retcode;
1217
1218 /* free the pseudo device attach bits */
1219
1220 cf = device_cfdata(rs->sc_dev);
1221 if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
1222 free(cf, M_RAIDFRAME);
1223
1224 return (retcode);
1225 case RAIDFRAME_GET_COMPONENT_LABEL:
1226 clabel_ptr = (RF_ComponentLabel_t **) data;
1227 /* need to read the component label for the disk indicated
1228 by row,column in clabel */
1229
1230 /*
1231 * Perhaps there should be an option to skip the in-core
1232 * copy and hit the disk, as with disklabel(8).
1233 */
1234 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1235
1236 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1237
1238 if (retcode) {
1239 RF_Free(clabel, sizeof(*clabel));
1240 return retcode;
1241 }
1242
1243 clabel->row = 0; /* Don't allow looking at anything else.*/
1244
1245 column = clabel->column;
1246
1247 if ((column < 0) || (column >= raidPtr->numCol +
1248 raidPtr->numSpare)) {
1249 RF_Free(clabel, sizeof(*clabel));
1250 return EINVAL;
1251 }
1252
1253 RF_Free(clabel, sizeof(*clabel));
1254
1255 clabel = raidget_component_label(raidPtr, column);
1256
1257 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1258
1259 #if 0
1260 case RAIDFRAME_SET_COMPONENT_LABEL:
1261 clabel = (RF_ComponentLabel_t *) data;
1262
1263 /* XXX check the label for valid stuff... */
1264 /* Note that some things *should not* get modified --
1265 the user should be re-initing the labels instead of
1266 trying to patch things.
1267 */
1268
1269 raidid = raidPtr->raidid;
1270 #ifdef DEBUG
1271 printf("raid%d: Got component label:\n", raidid);
1272 printf("raid%d: Version: %d\n", raidid, clabel->version);
1273 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1274 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1275 printf("raid%d: Column: %d\n", raidid, clabel->column);
1276 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1277 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1278 printf("raid%d: Status: %d\n", raidid, clabel->status);
1279 #endif
1280 clabel->row = 0;
1281 column = clabel->column;
1282
1283 if ((column < 0) || (column >= raidPtr->numCol)) {
1284 return(EINVAL);
1285 }
1286
1287 /* XXX this isn't allowed to do anything for now :-) */
1288
1289 /* XXX and before it is, we need to fill in the rest
1290 of the fields!?!?!?! */
1291 memcpy(raidget_component_label(raidPtr, column),
1292 clabel, sizeof(*clabel));
1293 raidflush_component_label(raidPtr, column);
1294 return (0);
1295 #endif
1296
1297 case RAIDFRAME_INIT_LABELS:
1298 clabel = (RF_ComponentLabel_t *) data;
1299 /*
1300 we only want the serial number from
1301 the above. We get all the rest of the information
1302 from the config that was used to create this RAID
1303 set.
1304 */
1305
1306 raidPtr->serial_number = clabel->serial_number;
1307
1308 for(column=0;column<raidPtr->numCol;column++) {
1309 diskPtr = &raidPtr->Disks[column];
1310 if (!RF_DEAD_DISK(diskPtr->status)) {
1311 ci_label = raidget_component_label(raidPtr,
1312 column);
1313 /* Zeroing this is important. */
1314 memset(ci_label, 0, sizeof(*ci_label));
1315 raid_init_component_label(raidPtr, ci_label);
1316 ci_label->serial_number =
1317 raidPtr->serial_number;
1318 ci_label->row = 0; /* we dont' pretend to support more */
1319 rf_component_label_set_partitionsize(ci_label,
1320 diskPtr->partitionSize);
1321 ci_label->column = column;
1322 raidflush_component_label(raidPtr, column);
1323 }
1324 /* XXXjld what about the spares? */
1325 }
1326
1327 return (retcode);
1328 case RAIDFRAME_SET_AUTOCONFIG:
1329 d = rf_set_autoconfig(raidPtr, *(int *) data);
1330 printf("raid%d: New autoconfig value is: %d\n",
1331 raidPtr->raidid, d);
1332 *(int *) data = d;
1333 return (retcode);
1334
1335 case RAIDFRAME_SET_ROOT:
1336 d = rf_set_rootpartition(raidPtr, *(int *) data);
1337 printf("raid%d: New rootpartition value is: %d\n",
1338 raidPtr->raidid, d);
1339 *(int *) data = d;
1340 return (retcode);
1341
1342 /* initialize all parity */
1343 case RAIDFRAME_REWRITEPARITY:
1344
1345 if (raidPtr->Layout.map->faultsTolerated == 0) {
1346 /* Parity for RAID 0 is trivially correct */
1347 raidPtr->parity_good = RF_RAID_CLEAN;
1348 return(0);
1349 }
1350
1351 if (raidPtr->parity_rewrite_in_progress == 1) {
1352 /* Re-write is already in progress! */
1353 return(EINVAL);
1354 }
1355
1356 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1357 rf_RewriteParityThread,
1358 raidPtr,"raid_parity");
1359 return (retcode);
1360
1361
1362 case RAIDFRAME_ADD_HOT_SPARE:
1363 sparePtr = (RF_SingleComponent_t *) data;
1364 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1365 retcode = rf_add_hot_spare(raidPtr, &component);
1366 return(retcode);
1367
1368 case RAIDFRAME_REMOVE_HOT_SPARE:
1369 return(retcode);
1370
1371 case RAIDFRAME_DELETE_COMPONENT:
1372 componentPtr = (RF_SingleComponent_t *)data;
1373 memcpy( &component, componentPtr,
1374 sizeof(RF_SingleComponent_t));
1375 retcode = rf_delete_component(raidPtr, &component);
1376 return(retcode);
1377
1378 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1379 componentPtr = (RF_SingleComponent_t *)data;
1380 memcpy( &component, componentPtr,
1381 sizeof(RF_SingleComponent_t));
1382 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1383 return(retcode);
1384
1385 case RAIDFRAME_REBUILD_IN_PLACE:
1386
1387 if (raidPtr->Layout.map->faultsTolerated == 0) {
1388 /* Can't do this on a RAID 0!! */
1389 return(EINVAL);
1390 }
1391
1392 if (raidPtr->recon_in_progress == 1) {
1393 /* a reconstruct is already in progress! */
1394 return(EINVAL);
1395 }
1396
1397 componentPtr = (RF_SingleComponent_t *) data;
1398 memcpy( &component, componentPtr,
1399 sizeof(RF_SingleComponent_t));
1400 component.row = 0; /* we don't support any more */
1401 column = component.column;
1402
1403 if ((column < 0) || (column >= raidPtr->numCol)) {
1404 return(EINVAL);
1405 }
1406
1407 rf_lock_mutex2(raidPtr->mutex);
1408 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1409 (raidPtr->numFailures > 0)) {
1410 /* XXX 0 above shouldn't be constant!!! */
1411 /* some component other than this has failed.
1412 Let's not make things worse than they already
1413 are... */
1414 printf("raid%d: Unable to reconstruct to disk at:\n",
1415 raidPtr->raidid);
1416 printf("raid%d: Col: %d Too many failures.\n",
1417 raidPtr->raidid, column);
1418 rf_unlock_mutex2(raidPtr->mutex);
1419 return (EINVAL);
1420 }
1421 if (raidPtr->Disks[column].status ==
1422 rf_ds_reconstructing) {
1423 printf("raid%d: Unable to reconstruct to disk at:\n",
1424 raidPtr->raidid);
1425 printf("raid%d: Col: %d Reconstruction already occurring!\n", raidPtr->raidid, column);
1426
1427 rf_unlock_mutex2(raidPtr->mutex);
1428 return (EINVAL);
1429 }
1430 if (raidPtr->Disks[column].status == rf_ds_spared) {
1431 rf_unlock_mutex2(raidPtr->mutex);
1432 return (EINVAL);
1433 }
1434 rf_unlock_mutex2(raidPtr->mutex);
1435
1436 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1437 if (rrcopy == NULL)
1438 return(ENOMEM);
1439
1440 rrcopy->raidPtr = (void *) raidPtr;
1441 rrcopy->col = column;
1442
1443 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1444 rf_ReconstructInPlaceThread,
1445 rrcopy,"raid_reconip");
1446 return(retcode);
1447
1448 case RAIDFRAME_GET_INFO:
1449 if (!raidPtr->valid)
1450 return (ENODEV);
1451 ucfgp = (RF_DeviceConfig_t **) data;
1452 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1453 (RF_DeviceConfig_t *));
1454 if (d_cfg == NULL)
1455 return (ENOMEM);
1456 d_cfg->rows = 1; /* there is only 1 row now */
1457 d_cfg->cols = raidPtr->numCol;
1458 d_cfg->ndevs = raidPtr->numCol;
1459 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1460 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1461 return (ENOMEM);
1462 }
1463 d_cfg->nspares = raidPtr->numSpare;
1464 if (d_cfg->nspares >= RF_MAX_DISKS) {
1465 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1466 return (ENOMEM);
1467 }
1468 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1469 d = 0;
1470 for (j = 0; j < d_cfg->cols; j++) {
1471 d_cfg->devs[d] = raidPtr->Disks[j];
1472 d++;
1473 }
1474 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1475 d_cfg->spares[i] = raidPtr->Disks[j];
1476 }
1477 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1478 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1479
1480 return (retcode);
1481
1482 case RAIDFRAME_CHECK_PARITY:
1483 *(int *) data = raidPtr->parity_good;
1484 return (0);
1485
1486 case RAIDFRAME_PARITYMAP_STATUS:
1487 if (rf_paritymap_ineligible(raidPtr))
1488 return EINVAL;
1489 rf_paritymap_status(raidPtr->parity_map,
1490 (struct rf_pmstat *)data);
1491 return 0;
1492
1493 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1494 if (rf_paritymap_ineligible(raidPtr))
1495 return EINVAL;
1496 if (raidPtr->parity_map == NULL)
1497 return ENOENT; /* ??? */
1498 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1499 (struct rf_pmparams *)data, 1))
1500 return EINVAL;
1501 return 0;
1502
1503 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1504 if (rf_paritymap_ineligible(raidPtr))
1505 return EINVAL;
1506 *(int *) data = rf_paritymap_get_disable(raidPtr);
1507 return 0;
1508
1509 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1510 if (rf_paritymap_ineligible(raidPtr))
1511 return EINVAL;
1512 rf_paritymap_set_disable(raidPtr, *(int *)data);
1513 /* XXX should errors be passed up? */
1514 return 0;
1515
1516 case RAIDFRAME_RESET_ACCTOTALS:
1517 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1518 return (0);
1519
1520 case RAIDFRAME_GET_ACCTOTALS:
1521 totals = (RF_AccTotals_t *) data;
1522 *totals = raidPtr->acc_totals;
1523 return (0);
1524
1525 case RAIDFRAME_KEEP_ACCTOTALS:
1526 raidPtr->keep_acc_totals = *(int *)data;
1527 return (0);
1528
1529 case RAIDFRAME_GET_SIZE:
1530 *(int *) data = raidPtr->totalSectors;
1531 return (0);
1532
1533 /* fail a disk & optionally start reconstruction */
1534 case RAIDFRAME_FAIL_DISK:
1535
1536 if (raidPtr->Layout.map->faultsTolerated == 0) {
1537 /* Can't do this on a RAID 0!! */
1538 return(EINVAL);
1539 }
1540
1541 rr = (struct rf_recon_req *) data;
1542 rr->row = 0;
1543 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1544 return (EINVAL);
1545
1546
1547 rf_lock_mutex2(raidPtr->mutex);
1548 if (raidPtr->status == rf_rs_reconstructing) {
1549 /* you can't fail a disk while we're reconstructing! */
1550 /* XXX wrong for RAID6 */
1551 rf_unlock_mutex2(raidPtr->mutex);
1552 return (EINVAL);
1553 }
1554 if ((raidPtr->Disks[rr->col].status ==
1555 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1556 /* some other component has failed. Let's not make
1557 things worse. XXX wrong for RAID6 */
1558 rf_unlock_mutex2(raidPtr->mutex);
1559 return (EINVAL);
1560 }
1561 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1562 /* Can't fail a spared disk! */
1563 rf_unlock_mutex2(raidPtr->mutex);
1564 return (EINVAL);
1565 }
1566 rf_unlock_mutex2(raidPtr->mutex);
1567
1568 /* make a copy of the recon request so that we don't rely on
1569 * the user's buffer */
1570 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1571 if (rrcopy == NULL)
1572 return(ENOMEM);
1573 memcpy(rrcopy, rr, sizeof(*rr));
1574 rrcopy->raidPtr = (void *) raidPtr;
1575
1576 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1577 rf_ReconThread,
1578 rrcopy,"raid_recon");
1579 return (0);
1580
1581 /* invoke a copyback operation after recon on whatever disk
1582 * needs it, if any */
1583 case RAIDFRAME_COPYBACK:
1584
1585 if (raidPtr->Layout.map->faultsTolerated == 0) {
1586 /* This makes no sense on a RAID 0!! */
1587 return(EINVAL);
1588 }
1589
1590 if (raidPtr->copyback_in_progress == 1) {
1591 /* Copyback is already in progress! */
1592 return(EINVAL);
1593 }
1594
1595 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1596 rf_CopybackThread,
1597 raidPtr,"raid_copyback");
1598 return (retcode);
1599
1600 /* return the percentage completion of reconstruction */
1601 case RAIDFRAME_CHECK_RECON_STATUS:
1602 if (raidPtr->Layout.map->faultsTolerated == 0) {
1603 /* This makes no sense on a RAID 0, so tell the
1604 user it's done. */
1605 *(int *) data = 100;
1606 return(0);
1607 }
1608 if (raidPtr->status != rf_rs_reconstructing)
1609 *(int *) data = 100;
1610 else {
1611 if (raidPtr->reconControl->numRUsTotal > 0) {
1612 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1613 } else {
1614 *(int *) data = 0;
1615 }
1616 }
1617 return (0);
1618 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1619 progressInfoPtr = (RF_ProgressInfo_t **) data;
1620 if (raidPtr->status != rf_rs_reconstructing) {
1621 progressInfo.remaining = 0;
1622 progressInfo.completed = 100;
1623 progressInfo.total = 100;
1624 } else {
1625 progressInfo.total =
1626 raidPtr->reconControl->numRUsTotal;
1627 progressInfo.completed =
1628 raidPtr->reconControl->numRUsComplete;
1629 progressInfo.remaining = progressInfo.total -
1630 progressInfo.completed;
1631 }
1632 retcode = copyout(&progressInfo, *progressInfoPtr,
1633 sizeof(RF_ProgressInfo_t));
1634 return (retcode);
1635
1636 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1637 if (raidPtr->Layout.map->faultsTolerated == 0) {
1638 /* This makes no sense on a RAID 0, so tell the
1639 user it's done. */
1640 *(int *) data = 100;
1641 return(0);
1642 }
1643 if (raidPtr->parity_rewrite_in_progress == 1) {
1644 *(int *) data = 100 *
1645 raidPtr->parity_rewrite_stripes_done /
1646 raidPtr->Layout.numStripe;
1647 } else {
1648 *(int *) data = 100;
1649 }
1650 return (0);
1651
1652 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1653 progressInfoPtr = (RF_ProgressInfo_t **) data;
1654 if (raidPtr->parity_rewrite_in_progress == 1) {
1655 progressInfo.total = raidPtr->Layout.numStripe;
1656 progressInfo.completed =
1657 raidPtr->parity_rewrite_stripes_done;
1658 progressInfo.remaining = progressInfo.total -
1659 progressInfo.completed;
1660 } else {
1661 progressInfo.remaining = 0;
1662 progressInfo.completed = 100;
1663 progressInfo.total = 100;
1664 }
1665 retcode = copyout(&progressInfo, *progressInfoPtr,
1666 sizeof(RF_ProgressInfo_t));
1667 return (retcode);
1668
1669 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1670 if (raidPtr->Layout.map->faultsTolerated == 0) {
1671 /* This makes no sense on a RAID 0 */
1672 *(int *) data = 100;
1673 return(0);
1674 }
1675 if (raidPtr->copyback_in_progress == 1) {
1676 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1677 raidPtr->Layout.numStripe;
1678 } else {
1679 *(int *) data = 100;
1680 }
1681 return (0);
1682
1683 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1684 progressInfoPtr = (RF_ProgressInfo_t **) data;
1685 if (raidPtr->copyback_in_progress == 1) {
1686 progressInfo.total = raidPtr->Layout.numStripe;
1687 progressInfo.completed =
1688 raidPtr->copyback_stripes_done;
1689 progressInfo.remaining = progressInfo.total -
1690 progressInfo.completed;
1691 } else {
1692 progressInfo.remaining = 0;
1693 progressInfo.completed = 100;
1694 progressInfo.total = 100;
1695 }
1696 retcode = copyout(&progressInfo, *progressInfoPtr,
1697 sizeof(RF_ProgressInfo_t));
1698 return (retcode);
1699
1700 /* the sparetable daemon calls this to wait for the kernel to
1701 * need a spare table. this ioctl does not return until a
1702 * spare table is needed. XXX -- calling mpsleep here in the
1703 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1704 * -- I should either compute the spare table in the kernel,
1705 * or have a different -- XXX XXX -- interface (a different
1706 * character device) for delivering the table -- XXX */
1707 #if 0
1708 case RAIDFRAME_SPARET_WAIT:
1709 rf_lock_mutex2(rf_sparet_wait_mutex);
1710 while (!rf_sparet_wait_queue)
1711 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1712 waitreq = rf_sparet_wait_queue;
1713 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1714 rf_unlock_mutex2(rf_sparet_wait_mutex);
1715
1716 /* structure assignment */
1717 *((RF_SparetWait_t *) data) = *waitreq;
1718
1719 RF_Free(waitreq, sizeof(*waitreq));
1720 return (0);
1721
	/* wakes up a process waiting on SPARET_WAIT and puts an error
	 * code in it that will cause the daemon to exit */
1724 case RAIDFRAME_ABORT_SPARET_WAIT:
1725 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1726 waitreq->fcol = -1;
1727 rf_lock_mutex2(rf_sparet_wait_mutex);
1728 waitreq->next = rf_sparet_wait_queue;
1729 rf_sparet_wait_queue = waitreq;
1730 rf_broadcast_conf2(rf_sparet_wait_cv);
1731 rf_unlock_mutex2(rf_sparet_wait_mutex);
1732 return (0);
1733
1734 /* used by the spare table daemon to deliver a spare table
1735 * into the kernel */
1736 case RAIDFRAME_SEND_SPARET:
1737
1738 /* install the spare table */
1739 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1740
1741 /* respond to the requestor. the return status of the spare
1742 * table installation is passed in the "fcol" field */
1743 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1744 waitreq->fcol = retcode;
1745 rf_lock_mutex2(rf_sparet_wait_mutex);
1746 waitreq->next = rf_sparet_resp_queue;
1747 rf_sparet_resp_queue = waitreq;
1748 rf_broadcast_cond2(rf_sparet_resp_cv);
1749 rf_unlock_mutex2(rf_sparet_wait_mutex);
1750
1751 return (retcode);
1752 #endif
1753
1754 default:
1755 break; /* fall through to the os-specific code below */
1756
1757 }
1758
1759 if (!raidPtr->valid)
1760 return (EINVAL);
1761
1762 /*
1763 * Add support for "regular" device ioctls here.
1764 */
1765
1766 error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
1767 if (error != EPASSTHROUGH)
1768 return (error);
1769
1770 switch (cmd) {
1771 case DIOCGDINFO:
1772 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1773 break;
1774 #ifdef __HAVE_OLD_DISKLABEL
1775 case ODIOCGDINFO:
1776 newlabel = *(rs->sc_dkdev.dk_label);
1777 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1778 return ENOTTY;
1779 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1780 break;
1781 #endif
1782
1783 case DIOCGPART:
1784 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1785 ((struct partinfo *) data)->part =
1786 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1787 break;
1788
1789 case DIOCWDINFO:
1790 case DIOCSDINFO:
1791 #ifdef __HAVE_OLD_DISKLABEL
1792 case ODIOCWDINFO:
1793 case ODIOCSDINFO:
1794 #endif
1795 {
1796 struct disklabel *lp;
1797 #ifdef __HAVE_OLD_DISKLABEL
1798 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1799 memset(&newlabel, 0, sizeof newlabel);
1800 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1801 lp = &newlabel;
1802 } else
1803 #endif
1804 lp = (struct disklabel *)data;
1805
1806 if ((error = raidlock(rs)) != 0)
1807 return (error);
1808
1809 rs->sc_flags |= RAIDF_LABELLING;
1810
1811 error = setdisklabel(rs->sc_dkdev.dk_label,
1812 lp, 0, rs->sc_dkdev.dk_cpulabel);
1813 if (error == 0) {
1814 if (cmd == DIOCWDINFO
1815 #ifdef __HAVE_OLD_DISKLABEL
1816 || cmd == ODIOCWDINFO
1817 #endif
1818 )
1819 error = writedisklabel(RAIDLABELDEV(dev),
1820 raidstrategy, rs->sc_dkdev.dk_label,
1821 rs->sc_dkdev.dk_cpulabel);
1822 }
1823 rs->sc_flags &= ~RAIDF_LABELLING;
1824
1825 raidunlock(rs);
1826
1827 if (error)
1828 return (error);
1829 break;
1830 }
1831
1832 case DIOCWLABEL:
1833 if (*(int *) data != 0)
1834 rs->sc_flags |= RAIDF_WLABEL;
1835 else
1836 rs->sc_flags &= ~RAIDF_WLABEL;
1837 break;
1838
1839 case DIOCGDEFLABEL:
1840 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1841 break;
1842
1843 #ifdef __HAVE_OLD_DISKLABEL
1844 case ODIOCGDEFLABEL:
1845 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1846 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1847 return ENOTTY;
1848 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1849 break;
1850 #endif
1851
1852 case DIOCAWEDGE:
1853 case DIOCDWEDGE:
1854 dkw = (void *)data;
1855
1856 /* If the ioctl happens here, the parent is us. */
1857 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1858 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1859
1860 case DIOCLWEDGES:
1861 return dkwedge_list(&rs->sc_dkdev,
1862 (struct dkwedge_list *)data, l);
1863 case DIOCCACHESYNC:
1864 return rf_sync_component_caches(raidPtr);
1865
1866 case DIOCGSTRATEGY:
1867 {
1868 struct disk_strategy *dks = (void *)data;
1869
1870 s = splbio();
1871 strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
1872 sizeof(dks->dks_name));
1873 splx(s);
1874 dks->dks_paramlen = 0;
1875
1876 return 0;
1877 }
1878
1879 case DIOCSSTRATEGY:
1880 {
1881 struct disk_strategy *dks = (void *)data;
1882 struct bufq_state *new;
1883 struct bufq_state *old;
1884
1885 if (dks->dks_param != NULL) {
1886 return EINVAL;
1887 }
1888 dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
1889 error = bufq_alloc(&new, dks->dks_name,
1890 BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
1891 if (error) {
1892 return error;
1893 }
1894 s = splbio();
1895 old = rs->buf_queue;
1896 bufq_move(new, old);
1897 rs->buf_queue = new;
1898 splx(s);
1899 bufq_free(old);
1900
1901 return 0;
1902 }
1903
1904 default:
1905 retcode = ENOTTY;
1906 }
1907 return (retcode);
1908
1909 }
1910
1911
1912 /* raidinit -- complete the rest of the initialization for the
1913 RAIDframe device. */
1914
1915
/*
 * Finish bringing up a freshly-configured RAID set: attach the
 * pseudo-device via autoconf, initialize the disk(9) structures,
 * and scan the new device for wedges.  On attach failure the
 * RAIDF_INITED flag is rolled back and the cfdata is released.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	int     unit;
	RF_Raid_t *raidPtr = &rs->sc_r;

	unit = raidPtr->raidid;


	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev == NULL) {
		/* Attach failed: undo RAIDF_INITED and free our cfdata. */
		printf("raid%d: config_attach_pseudo failed\n",
		       raidPtr->raidid);
		rs->sc_flags &= ~RAIDF_INITED;
		free(cf, M_RAIDFRAME);
		return;
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);
	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Autodiscover dk(4) wedges on the newly-attached device. */
	dkwedge_discover(&rs->sc_dkdev);

	rf_set_properties(rs, raidPtr);

}
1967 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1968 /* wake up the daemon & tell it to get us a spare table
1969 * XXX
1970 * the entries in the queues should be tagged with the raidPtr
1971 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
1973 * XXX
1974 *
1975 * XXX This code is not currently used. GO
1976 */
/*
 * Hand 'req' to the user-level sparetable daemon (via the wait queue)
 * and sleep until the daemon delivers a response on the response
 * queue.  Returns the status code the daemon placed in the response's
 * fcol field.
 *
 * Note: the response dequeued below is a different RF_SparetWait_t
 * than the one passed in; the one we free here is the daemon's reply.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2 drops the mutex while asleep and re-acquires
	 * it before returning, so the queue check is always made with
	 * the mutex held. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
2000 #endif
2001
2002 /* a wrapper around rf_DoAccess that extracts appropriate info from the
2003 * bp & passes it down.
2004 * any calls originating in the kernel must use non-blocking I/O
2005 * do some extra sanity checking to return "appropriate" error values for
2006 * certain conditions (to make some standard utilities work)
2007 *
2008 * Formerly known as: rf_DoAccessKernel
2009 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the mutex while updating component labels, then
		 * retake it to adjust the failure count. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Loop invariant: raidPtr->mutex is held at the top of each
	 * iteration and released inside the body; error paths retake
	 * it before 'continue'. */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb: one extra sector if the byte count isn't a whole
		 * number of sectors. */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject requests past the end of the set; the (sum < x)
		 * comparisons also catch arithmetic wrap-around. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Reject transfers that are not a multiple of the sector
		 * size. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* Consume one opening before dispatching the access. */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
2127
2128
2129
2130
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */

/*
 * Dispatch one queued request to the underlying component device.
 * NOP requests just bump numOutstanding and fake an immediate
 * completion; READ/WRITE requests are packaged into req->bp and handed
 * to the component's strategy routine.  Completion is reported
 * asynchronously through KernelWakeupFunc().  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* fake a completion so the normal iodone path runs */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up bp for the component device; KernelWakeupFunc
		   will run at biodone() time with req in b_private */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/*
 * This is the callback function (b_iodone) associated with an I/O
 * invoked from kernel code via rf_DispatchKernelIO().  It records the
 * error status in the request, fails the component on I/O error (if
 * doing so would not exceed the layout's fault tolerance), and hands
 * the finished request to the raidio thread via the iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by rf_DispatchKernelIO/InitBP */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		/* account the elapsed time as disk wait / physical I/O */
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error. Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2274
2275
2276 /*
2277 * initialize a buf structure for doing an I/O in the kernel.
2278 */
2279 static void
2280 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2281 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2282 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2283 struct proc *b_proc)
2284 {
2285 /* bp->b_flags = B_PHYS | rw_flag; */
2286 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2287 bp->b_oflags = 0;
2288 bp->b_cflags = 0;
2289 bp->b_bcount = numSect << logBytesPerSector;
2290 bp->b_bufsize = bp->b_bcount;
2291 bp->b_error = 0;
2292 bp->b_dev = dev;
2293 bp->b_data = bf;
2294 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2295 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2296 if (bp->b_bcount == 0) {
2297 panic("bp->b_bcount is zero in InitBP!!");
2298 }
2299 bp->b_proc = b_proc;
2300 bp->b_iodone = cbFunc;
2301 bp->b_private = cbArg;
2302 }
2303
2304 static void
2305 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2306 struct disklabel *lp)
2307 {
2308 memset(lp, 0, sizeof(*lp));
2309
2310 /* fabricate a label... */
2311 lp->d_secperunit = raidPtr->totalSectors;
2312 lp->d_secsize = raidPtr->bytesPerSector;
2313 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2314 lp->d_ntracks = 4 * raidPtr->numCol;
2315 lp->d_ncylinders = raidPtr->totalSectors /
2316 (lp->d_nsectors * lp->d_ntracks);
2317 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2318
2319 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2320 lp->d_type = DTYPE_RAID;
2321 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2322 lp->d_rpm = 3600;
2323 lp->d_interleave = 1;
2324 lp->d_flags = 0;
2325
2326 lp->d_partitions[RAW_PART].p_offset = 0;
2327 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2328 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2329 lp->d_npartitions = RAW_PART + 1;
2330
2331 lp->d_magic = DISKMAGIC;
2332 lp->d_magic2 = DISKMAGIC;
2333 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2334
2335 }
2336 /*
2337 * Read the disklabel from the raid device. If one is not present, fake one
2338 * up.
2339 */
2340 static void
2341 raidgetdisklabel(dev_t dev)
2342 {
2343 int unit = raidunit(dev);
2344 struct raid_softc *rs;
2345 const char *errstring;
2346 struct disklabel *lp;
2347 struct cpu_disklabel *clp;
2348 RF_Raid_t *raidPtr;
2349
2350 if ((rs = raidget(unit)) == NULL)
2351 return;
2352
2353 lp = rs->sc_dkdev.dk_label;
2354 clp = rs->sc_dkdev.dk_cpulabel;
2355
2356 db1_printf(("Getting the disklabel...\n"));
2357
2358 memset(clp, 0, sizeof(*clp));
2359
2360 raidPtr = &rs->sc_r;
2361
2362 raidgetdefaultlabel(raidPtr, rs, lp);
2363
2364 /*
2365 * Call the generic disklabel extraction routine.
2366 */
2367 errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
2368 rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
2369 if (errstring)
2370 raidmakedisklabel(rs);
2371 else {
2372 int i;
2373 struct partition *pp;
2374
2375 /*
2376 * Sanity check whether the found disklabel is valid.
2377 *
2378 * This is necessary since total size of the raid device
2379 * may vary when an interleave is changed even though exactly
2380 * same components are used, and old disklabel may used
2381 * if that is found.
2382 */
2383 if (lp->d_secperunit != rs->sc_size)
2384 printf("raid%d: WARNING: %s: "
2385 "total sector size in disklabel (%" PRIu32 ") != "
2386 "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
2387 lp->d_secperunit, rs->sc_size);
2388 for (i = 0; i < lp->d_npartitions; i++) {
2389 pp = &lp->d_partitions[i];
2390 if (pp->p_offset + pp->p_size > rs->sc_size)
2391 printf("raid%d: WARNING: %s: end of partition `%c' "
2392 "exceeds the size of raid (%" PRIu64 ")\n",
2393 unit, rs->sc_xname, 'a' + i, rs->sc_size);
2394 }
2395 }
2396
2397 }
2398 /*
2399 * Take care of things one might want to take care of in the event
2400 * that a disklabel isn't present.
2401 */
2402 static void
2403 raidmakedisklabel(struct raid_softc *rs)
2404 {
2405 struct disklabel *lp = rs->sc_dkdev.dk_label;
2406 db1_printf(("Making a label..\n"));
2407
2408 /*
2409 * For historical reasons, if there's no disklabel present
2410 * the raw partition must be marked FS_BSDFFS.
2411 */
2412
2413 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2414
2415 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2416
2417 lp->d_checksum = dkcksum(lp);
2418 }
2419 /*
2420 * Wait interruptibly for an exclusive lock.
2421 *
2422 * XXX
2423 * Several drivers do this; it should be abstracted and made MP-safe.
2424 * (Hmm... where have we seen this warning before :-> GO )
2425 */
2426 static int
2427 raidlock(struct raid_softc *rs)
2428 {
2429 int error;
2430
2431 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2432 rs->sc_flags |= RAIDF_WANTED;
2433 if ((error =
2434 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2435 return (error);
2436 }
2437 rs->sc_flags |= RAIDF_LOCKED;
2438 return (0);
2439 }
2440 /*
2441 * Unlock and wake up any waiters.
2442 */
2443 static void
2444 raidunlock(struct raid_softc *rs)
2445 {
2446
2447 rs->sc_flags &= ~RAIDF_LOCKED;
2448 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2449 rs->sc_flags &= ~RAIDF_WANTED;
2450 wakeup(rs);
2451 }
2452 }
2453
2454
2455 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2456 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2457 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2458
2459 static daddr_t
2460 rf_component_info_offset(void)
2461 {
2462
2463 return RF_COMPONENT_INFO_OFFSET;
2464 }
2465
2466 static daddr_t
2467 rf_component_info_size(unsigned secsize)
2468 {
2469 daddr_t info_size;
2470
2471 KASSERT(secsize);
2472 if (secsize > RF_COMPONENT_INFO_SIZE)
2473 info_size = secsize;
2474 else
2475 info_size = RF_COMPONENT_INFO_SIZE;
2476
2477 return info_size;
2478 }
2479
2480 static daddr_t
2481 rf_parity_map_offset(RF_Raid_t *raidPtr)
2482 {
2483 daddr_t map_offset;
2484
2485 KASSERT(raidPtr->bytesPerSector);
2486 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2487 map_offset = raidPtr->bytesPerSector;
2488 else
2489 map_offset = RF_COMPONENT_INFO_SIZE;
2490 map_offset += rf_component_info_offset();
2491
2492 return map_offset;
2493 }
2494
2495 static daddr_t
2496 rf_parity_map_size(RF_Raid_t *raidPtr)
2497 {
2498 daddr_t map_size;
2499
2500 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2501 map_size = raidPtr->bytesPerSector;
2502 else
2503 map_size = RF_PARITY_MAP_SIZE;
2504
2505 return map_size;
2506 }
2507
2508 int
2509 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2510 {
2511 RF_ComponentLabel_t *clabel;
2512
2513 clabel = raidget_component_label(raidPtr, col);
2514 clabel->clean = RF_RAID_CLEAN;
2515 raidflush_component_label(raidPtr, col);
2516 return(0);
2517 }
2518
2519
2520 int
2521 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2522 {
2523 RF_ComponentLabel_t *clabel;
2524
2525 clabel = raidget_component_label(raidPtr, col);
2526 clabel->clean = RF_RAID_DIRTY;
2527 raidflush_component_label(raidPtr, col);
2528 return(0);
2529 }
2530
/*
 * Read the on-disk component label for column `col' into the in-core
 * copy kept in raid_cinfo[col].ci_label.  Returns 0 on success or an
 * error from raidread_component_label().
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2540
/*
 * Return a pointer to the in-core component label for column `col'.
 * Does not re-read the label from disk; see raidfetch_component_label().
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2546
/*
 * Write the in-core component label for column `col' back to disk,
 * stamping it with the set's current mod_counter first.  Returns 0 on
 * success or an error from raidwrite_component_label().
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity map's mod counter in step with the label's */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
2561
2562
/*
 * Read a component label from `dev' into *clabel.  The on-disk area
 * read spans at least one sector of `secsize' bytes (see
 * rf_component_info_size()).
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
2572
2573 /* ARGSUSED */
2574 static int
2575 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2576 size_t msize, daddr_t offset, daddr_t dsize)
2577 {
2578 struct buf *bp;
2579 const struct bdevsw *bdev;
2580 int error;
2581
2582 /* XXX should probably ensure that we don't try to do this if
2583 someone has changed rf_protected_sectors. */
2584
2585 if (b_vp == NULL) {
2586 /* For whatever reason, this component is not valid.
2587 Don't try to read a component label from it. */
2588 return(EINVAL);
2589 }
2590
2591 /* get a block of the appropriate size... */
2592 bp = geteblk((int)dsize);
2593 bp->b_dev = dev;
2594
2595 /* get our ducks in a row for the read */
2596 bp->b_blkno = offset / DEV_BSIZE;
2597 bp->b_bcount = dsize;
2598 bp->b_flags |= B_READ;
2599 bp->b_resid = dsize;
2600
2601 bdev = bdevsw_lookup(bp->b_dev);
2602 if (bdev == NULL)
2603 return (ENXIO);
2604 (*bdev->d_strategy)(bp);
2605
2606 error = biowait(bp);
2607
2608 if (!error) {
2609 memcpy(data, bp->b_data, msize);
2610 }
2611
2612 brelse(bp, 0);
2613 return(error);
2614 }
2615
2616
/*
 * Write *clabel to the component-label area of `dev'.  The write is
 * synchronous (final argument asyncp == 0).
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
2626
2627 /* ARGSUSED */
2628 static int
2629 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2630 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2631 {
2632 struct buf *bp;
2633 const struct bdevsw *bdev;
2634 int error;
2635
2636 /* get a block of the appropriate size... */
2637 bp = geteblk((int)dsize);
2638 bp->b_dev = dev;
2639
2640 /* get our ducks in a row for the write */
2641 bp->b_blkno = offset / DEV_BSIZE;
2642 bp->b_bcount = dsize;
2643 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2644 bp->b_resid = dsize;
2645
2646 memset(bp->b_data, 0, dsize);
2647 memcpy(bp->b_data, data, msize);
2648
2649 bdev = bdevsw_lookup(bp->b_dev);
2650 if (bdev == NULL)
2651 return (ENXIO);
2652 (*bdev->d_strategy)(bp);
2653 if (asyncp)
2654 return 0;
2655 error = biowait(bp);
2656 brelse(bp, 0);
2657 if (error) {
2658 #if 1
2659 printf("Failed to write RAID component info!\n");
2660 #endif
2661 }
2662
2663 return(error);
2664 }
2665
2666 void
2667 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2668 {
2669 int c;
2670
2671 for (c = 0; c < raidPtr->numCol; c++) {
2672 /* Skip dead disks. */
2673 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2674 continue;
2675 /* XXXjld: what if an error occurs here? */
2676 raidwrite_component_area(raidPtr->Disks[c].dev,
2677 raidPtr->raid_cinfo[c].ci_vp, map,
2678 RF_PARITYMAP_NBYTE,
2679 rf_parity_map_offset(raidPtr),
2680 rf_parity_map_size(raidPtr), 0);
2681 }
2682 }
2683
2684 void
2685 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2686 {
2687 struct rf_paritymap_ondisk tmp;
2688 int c,first;
2689
2690 first=1;
2691 for (c = 0; c < raidPtr->numCol; c++) {
2692 /* Skip dead disks. */
2693 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2694 continue;
2695 raidread_component_area(raidPtr->Disks[c].dev,
2696 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2697 RF_PARITYMAP_NBYTE,
2698 rf_parity_map_offset(raidPtr),
2699 rf_parity_map_size(raidPtr));
2700 if (first) {
2701 memcpy(map, &tmp, sizeof(*map));
2702 first = 0;
2703 } else {
2704 rf_paritymap_merge(map, &tmp);
2705 }
2706 }
2707 }
2708
/*
 * Bump the set's mod counter and mark the component labels of all live
 * disks (and in-use spares) dirty on disk, so an unclean shutdown can
 * be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column references this spare,
			   scol keeps its prior value (-1 initially) and is
			   written to clabel->column below -- confirm that
			   an in-use spare always has a referencing column */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2768
2769
/*
 * Push updated component labels (new mod counter, optimal status, and
 * the unit number we are configured as) out to every optimal component
 * and every in-use spare.  When `final' is RF_FINAL_COMPONENT_UPDATE
 * and parity is known good, the labels are additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2844
2845 void
2846 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2847 {
2848
2849 if (vp != NULL) {
2850 if (auto_configured == 1) {
2851 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2852 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2853 vput(vp);
2854
2855 } else {
2856 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2857 }
2858 }
2859 }
2860
2861
2862 void
2863 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2864 {
2865 int r,c;
2866 struct vnode *vp;
2867 int acd;
2868
2869
2870 /* We take this opportunity to close the vnodes like we should.. */
2871
2872 for (c = 0; c < raidPtr->numCol; c++) {
2873 vp = raidPtr->raid_cinfo[c].ci_vp;
2874 acd = raidPtr->Disks[c].auto_configured;
2875 rf_close_component(raidPtr, vp, acd);
2876 raidPtr->raid_cinfo[c].ci_vp = NULL;
2877 raidPtr->Disks[c].auto_configured = 0;
2878 }
2879
2880 for (r = 0; r < raidPtr->numSpare; r++) {
2881 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2882 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2883 rf_close_component(raidPtr, vp, acd);
2884 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2885 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2886 }
2887 }
2888
2889
/*
 * Kernel thread body: fail the disk named in `req' and, if
 * RF_FDFLAGS_RECON is set, reconstruct it to a spare.  Consumes and
 * frees `req'.  Never returns; exits via kthread_exit().
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* second arg: 1 == also initiate reconstruction */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* we own req; release it now that it has been acted on */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2911
/*
 * Kernel thread body: rewrite (recompute) all parity for the set.
 * On success the set is flagged parity-clean; anyone blocked in
 * shutdown waiting on parity_rewrite_in_progress is woken.  Never
 * returns; exits via kthread_exit().
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2942
2943
2944 void
2945 rf_CopybackThread(RF_Raid_t *raidPtr)
2946 {
2947 int s;
2948
2949 raidPtr->copyback_in_progress = 1;
2950 s = splbio();
2951 rf_CopybackReconstructedData(raidPtr);
2952 splx(s);
2953 raidPtr->copyback_in_progress = 0;
2954
2955 /* That's all... */
2956 kthread_exit(0); /* does not return */
2957 }
2958
2959
2960 void
2961 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2962 {
2963 int s;
2964 RF_Raid_t *raidPtr;
2965
2966 s = splbio();
2967 raidPtr = req->raidPtr;
2968 raidPtr->recon_in_progress = 1;
2969 rf_ReconstructInPlace(raidPtr, req->col);
2970 RF_Free(req, sizeof(*req));
2971 raidPtr->recon_in_progress = 0;
2972 splx(s);
2973
2974 /* That's all... */
2975 kthread_exit(0); /* does not return */
2976 }
2977
/*
 * Try to read and validate a component label from the device open on
 * `vp'.  On success a new entry is prepended to `ac_list' (which is
 * returned); that entry takes ownership of both the label and the open
 * vnode.  If the label is absent or unreasonable, the vnode is closed
 * and released and ac_list is returned unchanged.  On malloc failure
 * the whole accumulated list is torn down and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* out of memory: free every entry accumulated so far */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label. Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			    cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;	/* entry now owns the open vnode */
			ac->clabel = clabel;	/* ...and the label */
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: not a RAID component, close and drop the vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
3035
/*
 * Scan every disk-class device in the system for RAIDframe components.
 * Candidates are: a wedge of type RAIDFRAME, any disklabel partition
 * of type FS_RAID, or -- when neither was found on the device -- the
 * raw partition itself.  Each candidate's component label is read and
 * vetted by rf_get_component(); the resulting RF_AutoConfig_t list is
 * returned (possibly NULL).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* wedges carry their type in the wedge info, not
			   in a disklabel */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component takes over the open vnode */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
			rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
3217
3218
3219 int
3220 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3221 {
3222
3223 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3224 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3225 ((clabel->clean == RF_RAID_CLEAN) ||
3226 (clabel->clean == RF_RAID_DIRTY)) &&
3227 clabel->row >=0 &&
3228 clabel->column >= 0 &&
3229 clabel->num_rows > 0 &&
3230 clabel->num_columns > 0 &&
3231 clabel->row < clabel->num_rows &&
3232 clabel->column < clabel->num_columns &&
3233 clabel->blockSize > 0 &&
3234 /*
3235 * numBlocksHi may contain garbage, but it is ok since
3236 * the type is unsigned. If it is really garbage,
3237 * rf_fix_old_label_size() will fix it.
3238 */
3239 rf_component_label_numblocks(clabel) > 0) {
3240 /*
3241 * label looks reasonable enough...
3242 * let's make sure it has no old garbage.
3243 */
3244 if (numsecs)
3245 rf_fix_old_label_size(clabel, numsecs);
3246 return(1);
3247 }
3248 return(0);
3249 }
3250
3251
3252 /*
3253 * For reasons yet unknown, some old component labels have garbage in
3254 * the newer numBlocksHi region, and this causes lossage. Since those
3255 * disks will also have numsecs set to less than 32 bits of sectors,
3256 * we can determine when this corruption has occurred, and fix it.
3257 *
3258 * The exact same problem, with the same unknown reason, happens to
3259 * the partitionSizeHi member as well.
3260 */
3261 static void
3262 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3263 {
3264
3265 if (numsecs < ((uint64_t)1 << 32)) {
3266 if (clabel->numBlocksHi) {
3267 printf("WARNING: total sectors < 32 bits, yet "
3268 "numBlocksHi set\n"
3269 "WARNING: resetting numBlocksHi to zero.\n");
3270 clabel->numBlocksHi = 0;
3271 }
3272
3273 if (clabel->partitionSizeHi) {
3274 printf("WARNING: total sectors < 32 bits, yet "
3275 "partitionSizeHi set\n"
3276 "WARNING: resetting partitionSizeHi to zero.\n");
3277 clabel->partitionSizeHi = 0;
3278 }
3279 }
3280 }
3281
3282
3283 #ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.  Compiled only under DEBUG.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;

	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
3312 #endif
3313
3314 RF_ConfigSet_t *
3315 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3316 {
3317 RF_AutoConfig_t *ac;
3318 RF_ConfigSet_t *config_sets;
3319 RF_ConfigSet_t *cset;
3320 RF_AutoConfig_t *ac_next;
3321
3322
3323 config_sets = NULL;
3324
3325 /* Go through the AutoConfig list, and figure out which components
3326 belong to what sets. */
3327 ac = ac_list;
3328 while(ac!=NULL) {
3329 /* we're going to putz with ac->next, so save it here
3330 for use at the end of the loop */
3331 ac_next = ac->next;
3332
3333 if (config_sets == NULL) {
3334 /* will need at least this one... */
3335 config_sets = (RF_ConfigSet_t *)
3336 malloc(sizeof(RF_ConfigSet_t),
3337 M_RAIDFRAME, M_NOWAIT);
3338 if (config_sets == NULL) {
3339 panic("rf_create_auto_sets: No memory!");
3340 }
3341 /* this one is easy :) */
3342 config_sets->ac = ac;
3343 config_sets->next = NULL;
3344 config_sets->rootable = 0;
3345 ac->next = NULL;
3346 } else {
3347 /* which set does this component fit into? */
3348 cset = config_sets;
3349 while(cset!=NULL) {
3350 if (rf_does_it_fit(cset, ac)) {
3351 /* looks like it matches... */
3352 ac->next = cset->ac;
3353 cset->ac = ac;
3354 break;
3355 }
3356 cset = cset->next;
3357 }
3358 if (cset==NULL) {
3359 /* didn't find a match above... new set..*/
3360 cset = (RF_ConfigSet_t *)
3361 malloc(sizeof(RF_ConfigSet_t),
3362 M_RAIDFRAME, M_NOWAIT);
3363 if (cset == NULL) {
3364 panic("rf_create_auto_sets: No memory!");
3365 }
3366 cset->ac = ac;
3367 ac->next = NULL;
3368 cset->next = config_sets;
3369 cset->rootable = 0;
3370 config_sets = cset;
3371 }
3372 }
3373 ac = ac_next;
3374 }
3375
3376
3377 return(config_sets);
3378 }
3379
3380 static int
3381 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3382 {
3383 RF_ComponentLabel_t *clabel1, *clabel2;
3384
3385 /* If this one matches the *first* one in the set, that's good
3386 enough, since the other members of the set would have been
3387 through here too... */
3388 /* note that we are not checking partitionSize here..
3389
3390 Note that we are also not checking the mod_counters here.
3391 If everything else matches except the mod_counter, that's
3392 good enough for this test. We will deal with the mod_counters
3393 a little later in the autoconfiguration process.
3394
3395 (clabel1->mod_counter == clabel2->mod_counter) &&
3396
3397 The reason we don't check for this is that failed disks
3398 will have lower modification counts. If those disks are
3399 not added to the set they used to belong to, then they will
3400 form their own set, which may result in 2 different sets,
3401 for example, competing to be configured at raid0, and
3402 perhaps competing to be the root filesystem set. If the
3403 wrong ones get configured, or both attempt to become /,
3404 weird behaviour and or serious lossage will occur. Thus we
3405 need to bring them into the fold here, and kick them out at
3406 a later point.
3407
3408 */
3409
3410 clabel1 = cset->ac->clabel;
3411 clabel2 = ac->clabel;
3412 if ((clabel1->version == clabel2->version) &&
3413 (clabel1->serial_number == clabel2->serial_number) &&
3414 (clabel1->num_rows == clabel2->num_rows) &&
3415 (clabel1->num_columns == clabel2->num_columns) &&
3416 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3417 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3418 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3419 (clabel1->parityConfig == clabel2->parityConfig) &&
3420 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3421 (clabel1->blockSize == clabel2->blockSize) &&
3422 rf_component_label_numblocks(clabel1) ==
3423 rf_component_label_numblocks(clabel2) &&
3424 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3425 (clabel1->root_partition == clabel2->root_partition) &&
3426 (clabel1->last_unit == clabel2->last_unit) &&
3427 (clabel1->config_order == clabel2->config_order)) {
3428 /* if it get's here, it almost *has* to be a match */
3429 } else {
3430 /* it's not consistent with somebody in the set..
3431 punt */
3432 return(0);
3433 }
3434 /* all was fine.. it must fit... */
3435 return(1);
3436 }
3437
/*
 * Decide whether config set `cset' has enough live components to be
 * worth configuring.  Returns 1 if so, 0 if too many components are
 * missing for the set's RAID level.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set. If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set.
	   The authoritative value is the highest counter present in the
	   set: components with lower counters stopped being updated when
	   they failed. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a component carrying the current
	   mod_counter; anything else counts as missing/failed. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component. If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just finished the odd half of a mirror pair
			   and we didn't bail.. reset the
			   even_pair_failed flag, and go on to the
			   next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3540
3541 void
3542 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3543 RF_Raid_t *raidPtr)
3544 {
3545 RF_ComponentLabel_t *clabel;
3546 int i;
3547
3548 clabel = ac->clabel;
3549
3550 /* 1. Fill in the common stuff */
3551 config->numRow = clabel->num_rows = 1;
3552 config->numCol = clabel->num_columns;
3553 config->numSpare = 0; /* XXX should this be set here? */
3554 config->sectPerSU = clabel->sectPerSU;
3555 config->SUsPerPU = clabel->SUsPerPU;
3556 config->SUsPerRU = clabel->SUsPerRU;
3557 config->parityConfig = clabel->parityConfig;
3558 /* XXX... */
3559 strcpy(config->diskQueueType,"fifo");
3560 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3561 config->layoutSpecificSize = 0; /* XXX ?? */
3562
3563 while(ac!=NULL) {
3564 /* row/col values will be in range due to the checks
3565 in reasonable_label() */
3566 strcpy(config->devnames[0][ac->clabel->column],
3567 ac->devname);
3568 ac = ac->next;
3569 }
3570
3571 for(i=0;i<RF_MAXDBGV;i++) {
3572 config->debugVars[i][0] = 0;
3573 }
3574 }
3575
3576 int
3577 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3578 {
3579 RF_ComponentLabel_t *clabel;
3580 int column;
3581 int sparecol;
3582
3583 raidPtr->autoconfigure = new_value;
3584
3585 for(column=0; column<raidPtr->numCol; column++) {
3586 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3587 clabel = raidget_component_label(raidPtr, column);
3588 clabel->autoconfigure = new_value;
3589 raidflush_component_label(raidPtr, column);
3590 }
3591 }
3592 for(column = 0; column < raidPtr->numSpare ; column++) {
3593 sparecol = raidPtr->numCol + column;
3594 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3595 clabel = raidget_component_label(raidPtr, sparecol);
3596 clabel->autoconfigure = new_value;
3597 raidflush_component_label(raidPtr, sparecol);
3598 }
3599 }
3600 return(new_value);
3601 }
3602
3603 int
3604 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3605 {
3606 RF_ComponentLabel_t *clabel;
3607 int column;
3608 int sparecol;
3609
3610 raidPtr->root_partition = new_value;
3611 for(column=0; column<raidPtr->numCol; column++) {
3612 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3613 clabel = raidget_component_label(raidPtr, column);
3614 clabel->root_partition = new_value;
3615 raidflush_component_label(raidPtr, column);
3616 }
3617 }
3618 for(column = 0; column < raidPtr->numSpare ; column++) {
3619 sparecol = raidPtr->numCol + column;
3620 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3621 clabel = raidget_component_label(raidPtr, sparecol);
3622 clabel->root_partition = new_value;
3623 raidflush_component_label(raidPtr, sparecol);
3624 }
3625 }
3626 return(new_value);
3627 }
3628
3629 void
3630 rf_release_all_vps(RF_ConfigSet_t *cset)
3631 {
3632 RF_AutoConfig_t *ac;
3633
3634 ac = cset->ac;
3635 while(ac!=NULL) {
3636 /* Close the vp, and give it back */
3637 if (ac->vp) {
3638 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3639 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3640 vput(ac->vp);
3641 ac->vp = NULL;
3642 }
3643 ac = ac->next;
3644 }
3645 }
3646
3647
3648 void
3649 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3650 {
3651 RF_AutoConfig_t *ac;
3652 RF_AutoConfig_t *next_ac;
3653
3654 ac = cset->ac;
3655 while(ac!=NULL) {
3656 next_ac = ac->next;
3657 /* nuke the label */
3658 free(ac->clabel, M_RAIDFRAME);
3659 /* cleanup the config structure */
3660 free(ac, M_RAIDFRAME);
3661 /* "next.." */
3662 ac = next_ac;
3663 }
3664 /* and, finally, nuke the config set */
3665 free(cset, M_RAIDFRAME);
3666 }
3667
3668
3669 void
3670 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3671 {
3672 /* current version number */
3673 clabel->version = RF_COMPONENT_LABEL_VERSION;
3674 clabel->serial_number = raidPtr->serial_number;
3675 clabel->mod_counter = raidPtr->mod_counter;
3676
3677 clabel->num_rows = 1;
3678 clabel->num_columns = raidPtr->numCol;
3679 clabel->clean = RF_RAID_DIRTY; /* not clean */
3680 clabel->status = rf_ds_optimal; /* "It's good!" */
3681
3682 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3683 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3684 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3685
3686 clabel->blockSize = raidPtr->bytesPerSector;
3687 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3688
3689 /* XXX not portable */
3690 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3691 clabel->maxOutstanding = raidPtr->maxOutstanding;
3692 clabel->autoconfigure = raidPtr->autoconfigure;
3693 clabel->root_partition = raidPtr->root_partition;
3694 clabel->last_unit = raidPtr->raidid;
3695 clabel->config_order = raidPtr->config_order;
3696
3697 #ifndef RF_NO_PARITY_MAP
3698 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3699 #endif
3700 }
3701
/*
 * Autoconfigure one config set: allocate a configuration, find a free
 * raid unit (preferring the one recorded in the component labels),
 * and attempt to bring the set up.  Returns the configured softc, or
 * NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk upward from the unit recorded in the label until we find
	   a softc that is not already configured (sc_r.valid == 0). */
	/* NOTE(review): assumes raidget() never returns NULL for
	   increasing unit numbers -- verify against raidget(). */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		if (cset->ac->clabel->root_partition==1) {
			/* everything configured just fine.  Make a note
			   that this set is eligible to be root. */
			cset->rootable = 1;
			/* XXX do this here? */
			raidPtr->root_partition = 1;
		}
	} else {
		/* configuration failed; give the softc slot back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3767
3768 void
3769 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3770 {
3771 struct buf *bp;
3772 struct raid_softc *rs;
3773
3774 bp = (struct buf *)desc->bp;
3775 rs = desc->raidPtr->softc;
3776 disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
3777 (bp->b_flags & B_READ));
3778 }
3779
/*
 * Initialize a pool of `size'-byte items for RAIDframe at IPL_BIO,
 * pre-primed with `xmin' items and capped at `xmax' cached items.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);		/* cache at most xmax idle items */
	pool_prime(p, xmin);		/* pre-allocate xmin items up front */
	pool_setlowat(p, xmin);		/* keep at least xmin items around */
}
3789
3790 /*
3791 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
3792 * if there is IO pending and if that IO could possibly be done for a
3793 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3794 * otherwise.
3795 *
3796 */
3797
3798 int
3799 rf_buf_queue_check(RF_Raid_t *raidPtr)
3800 {
3801 struct raid_softc *rs = raidPtr->softc;
3802 if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
3803 /* there is work to do */
3804 return 0;
3805 }
3806 /* default is nothing to do */
3807 return 1;
3808 }
3809
3810 int
3811 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3812 {
3813 uint64_t numsecs;
3814 unsigned secsize;
3815 int error;
3816
3817 error = getdisksize(vp, &numsecs, &secsize);
3818 if (error == 0) {
3819 diskPtr->blockSize = secsize;
3820 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3821 diskPtr->partitionSize = numsecs;
3822 return 0;
3823 }
3824 return error;
3825 }
3826
/* Pseudo-device autoconf match: always succeeds. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3832
/* Pseudo-device attach: nothing to do; setup happens at configure time. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3838
3839
3840 static int
3841 raid_detach(device_t self, int flags)
3842 {
3843 int error;
3844 struct raid_softc *rs = device_private(self);
3845
3846 if ((error = raidlock(rs)) != 0)
3847 return (error);
3848
3849 error = raid_detach_unlocked(rs);
3850
3851 raidunlock(rs);
3852
3853 return error;
3854 }
3855
3856 static void
3857 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
3858 {
3859 prop_dictionary_t disk_info, odisk_info, geom;
3860 disk_info = prop_dictionary_create();
3861 geom = prop_dictionary_create();
3862 prop_dictionary_set_uint64(geom, "sectors-per-unit",
3863 raidPtr->totalSectors);
3864 prop_dictionary_set_uint32(geom, "sector-size",
3865 raidPtr->bytesPerSector);
3866
3867 prop_dictionary_set_uint16(geom, "sectors-per-track",
3868 raidPtr->Layout.dataSectorsPerStripe);
3869 prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
3870 4 * raidPtr->numCol);
3871
3872 prop_dictionary_set_uint64(geom, "cylinders-per-unit",
3873 raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
3874 (4 * raidPtr->numCol)));
3875
3876 prop_dictionary_set(disk_info, "geometry", geom);
3877 prop_object_release(geom);
3878 prop_dictionary_set(device_properties(rs->sc_dev),
3879 "disk-info", disk_info);
3880 odisk_info = rs->sc_dkdev.dk_info;
3881 rs->sc_dkdev.dk_info = disk_info;
3882 if (odisk_info)
3883 prop_object_release(odisk_info);
3884 }
3885
3886 /*
3887 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3888 * We end up returning whatever error was returned by the first cache flush
3889 * that fails.
3890 */
3891
3892 int
3893 rf_sync_component_caches(RF_Raid_t *raidPtr)
3894 {
3895 int c, sparecol;
3896 int e,error;
3897 int force = 1;
3898
3899 error = 0;
3900 for (c = 0; c < raidPtr->numCol; c++) {
3901 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3902 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3903 &force, FWRITE, NOCRED);
3904 if (e) {
3905 if (e != ENODEV)
3906 printf("raid%d: cache flush to component %s failed.\n",
3907 raidPtr->raidid, raidPtr->Disks[c].devname);
3908 if (error == 0) {
3909 error = e;
3910 }
3911 }
3912 }
3913 }
3914
3915 for( c = 0; c < raidPtr->numSpare ; c++) {
3916 sparecol = raidPtr->numCol + c;
3917 /* Need to ensure that the reconstruct actually completed! */
3918 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3919 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3920 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3921 if (e) {
3922 if (e != ENODEV)
3923 printf("raid%d: cache flush to component %s failed.\n",
3924 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3925 if (error == 0) {
3926 error = e;
3927 }
3928 }
3929 }
3930 }
3931 return error;
3932 }
3933