rf_netbsdkintf.c revision 1.300 1 /* $NetBSD: rf_netbsdkintf.c,v 1.300 2013/04/27 21:18:42 christos Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.300 2013/04/27 21:18:42 christos Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #include "raid.h"
110 #endif
111
112 #include <sys/param.h>
113 #include <sys/errno.h>
114 #include <sys/pool.h>
115 #include <sys/proc.h>
116 #include <sys/queue.h>
117 #include <sys/disk.h>
118 #include <sys/device.h>
119 #include <sys/stat.h>
120 #include <sys/ioctl.h>
121 #include <sys/fcntl.h>
122 #include <sys/systm.h>
123 #include <sys/vnode.h>
124 #include <sys/disklabel.h>
125 #include <sys/conf.h>
126 #include <sys/buf.h>
127 #include <sys/bufq.h>
128 #include <sys/reboot.h>
129 #include <sys/kauth.h>
130
131 #include <prop/proplib.h>
132
133 #include <dev/raidframe/raidframevar.h>
134 #include <dev/raidframe/raidframeio.h>
135 #include <dev/raidframe/rf_paritymap.h>
136
137 #include "rf_raid.h"
138 #include "rf_copyback.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150
151 #ifdef COMPAT_50
152 #include "rf_compat50.h"
153 #endif
154
155 #ifdef DEBUG
156 int rf_kdebug_level = 0;
157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
158 #else /* DEBUG */
159 #define db1_printf(a) { }
160 #endif /* DEBUG */
161
162 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
163 static rf_declare_mutex2(rf_sparet_wait_mutex);
164 static rf_declare_cond2(rf_sparet_wait_cv);
165 static rf_declare_cond2(rf_sparet_resp_cv);
166
167 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
168 * spare table */
169 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
170 * installation process */
171 #endif
172
173 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
174
175 /* prototypes */
176 static void KernelWakeupFunc(struct buf *);
177 static void InitBP(struct buf *, struct vnode *, unsigned,
178 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
179 void *, int, struct proc *);
180 struct raid_softc;
181 static void raidinit(struct raid_softc *);
182
183 void raidattach(int);
184 static int raid_match(device_t, cfdata_t, void *);
185 static void raid_attach(device_t, device_t, void *);
186 static int raid_detach(device_t, int);
187
188 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
189 daddr_t, daddr_t);
190 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
191 daddr_t, daddr_t, int);
192
193 static int raidwrite_component_label(unsigned,
194 dev_t, struct vnode *, RF_ComponentLabel_t *);
195 static int raidread_component_label(unsigned,
196 dev_t, struct vnode *, RF_ComponentLabel_t *);
197
198
199 dev_type_open(raidopen);
200 dev_type_close(raidclose);
201 dev_type_read(raidread);
202 dev_type_write(raidwrite);
203 dev_type_ioctl(raidioctl);
204 dev_type_strategy(raidstrategy);
205 dev_type_dump(raiddump);
206 dev_type_size(raidsize);
207
208 const struct bdevsw raid_bdevsw = {
209 raidopen, raidclose, raidstrategy, raidioctl,
210 raiddump, raidsize, D_DISK
211 };
212
213 const struct cdevsw raid_cdevsw = {
214 raidopen, raidclose, raidread, raidwrite, raidioctl,
215 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
216 };
217
218 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
219
/* Per-unit software state for a raid(4) pseudo-device. */
struct raid_softc {
	device_t sc_dev;	/* autoconf device handle */
	int     sc_unit;	/* raid unit number */
	int     sc_flags;	/* flags (RAIDF_* below) */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;		/* the RAIDframe descriptor itself */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the global raids list */
};
232 /* sc_flags */
233 #define RAIDF_INITED 0x01 /* unit has been initialized */
234 #define RAIDF_WLABEL 0x02 /* label area is writable */
235 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
236 #define RAIDF_SHUTDOWN 0x08 /* unit is being shutdown */
237 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
238 #define RAIDF_LOCKED 0x80 /* unit is locked */
239
240 #define raidunit(x) DISKUNIT(x)
241
242 extern struct cfdriver raid_cd;
243 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
244 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
245 DVF_DETACH_SHUTDOWN);
246
247 /*
248 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
249 * Be aware that large numbers can allow the driver to consume a lot of
250 * kernel memory, especially on writes, and in degraded mode reads.
251 *
252 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
253 * a single 64K write will typically require 64K for the old data,
254 * 64K for the old parity, and 64K for the new parity, for a total
255 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
257 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
258 *
259 * Now in degraded mode, for example, a 64K read on the above setup may
260 * require data reconstruction, which will require *all* of the 4 remaining
261 * disks to participate -- 4 * 32K/disk == 128K again.
262 */
263
264 #ifndef RAIDOUTSTANDING
265 #define RAIDOUTSTANDING 6
266 #endif
267
268 #define RAIDLABELDEV(dev) \
269 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
270
271 /* declared here, and made public, for the benefit of KVM stuff.. */
272
273 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
274 struct disklabel *);
275 static void raidgetdisklabel(dev_t);
276 static void raidmakedisklabel(struct raid_softc *);
277
278 static int raidlock(struct raid_softc *);
279 static void raidunlock(struct raid_softc *);
280
281 static int raid_detach_unlocked(struct raid_softc *);
282
283 static void rf_markalldirty(RF_Raid_t *);
284 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
285
286 void rf_ReconThread(struct rf_recon_req *);
287 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
288 void rf_CopybackThread(RF_Raid_t *raidPtr);
289 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
290 int rf_autoconfig(device_t);
291 void rf_buildroothack(RF_ConfigSet_t *);
292
293 RF_AutoConfig_t *rf_find_raid_components(void);
294 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
295 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
296 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
297 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
298 int rf_set_autoconfig(RF_Raid_t *, int);
299 int rf_set_rootpartition(RF_Raid_t *, int);
300 void rf_release_all_vps(RF_ConfigSet_t *);
301 void rf_cleanup_config_set(RF_ConfigSet_t *);
302 int rf_have_enough_components(RF_ConfigSet_t *);
303 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
304 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
305
306 /*
307 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
308 * Note that this is overridden by having RAID_AUTOCONFIG as an option
309 * in the kernel config file.
310 */
311 #ifdef RAID_AUTOCONFIG
312 int raidautoconfig = 1;
313 #else
314 int raidautoconfig = 0;
315 #endif
316 static bool raidautoconfigdone = false;
317
318 struct RF_Pools_s rf_pools;
319
320 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
321 static kmutex_t raid_lock;
322
323 static struct raid_softc *
324 raidcreate(int unit) {
325 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
326 if (sc == NULL) {
327 #ifdef DIAGNOSTIC
328 printf("%s: out of memory\n", __func__);
329 #endif
330 return NULL;
331 }
332 sc->sc_unit = unit;
333 sc->sc_r.softc = sc;
334 bufq_alloc(&sc->buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
335 return sc;
336 }
337
/*
 * Release a softc created by raidcreate(): free the buffer queue,
 * then the structure itself.  Caller must hold the only reference.
 */
static void
raiddestroy(struct raid_softc *sc) {
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
343
344 static struct raid_softc *
345 raidget(int unit) {
346 struct raid_softc *sc;
347 if (unit < 0) {
348 #ifdef DIAGNOSTIC
349 panic("%s: unit %d!", __func__, unit);
350 #endif
351 return NULL;
352 }
353 mutex_enter(&raid_lock);
354 LIST_FOREACH(sc, &raids, sc_link) {
355 if (sc->sc_unit == unit) {
356 mutex_exit(&raid_lock);
357 return sc;
358 }
359 }
360 mutex_exit(&raid_lock);
361 if ((sc = raidcreate(unit)) == NULL)
362 return NULL;
363 mutex_enter(&raid_lock);
364 LIST_INSERT_HEAD(&raids, sc, sc_link);
365 mutex_exit(&raid_lock);
366 return sc;
367 }
368
/*
 * Remove a softc from the global list and destroy it.  The caller
 * must guarantee no other references to the softc remain.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
376
/*
 * Driver bootstrap, called once at boot.  Initializes global
 * RAIDframe state, hooks the pseudo-device into autoconf, and
 * registers a finalizer so auto-configuration runs after all real
 * hardware has attached.  The 'num' argument (number of raid
 * pseudo-devices in the kernel config) is unused here: units are
 * created lazily by raidget().
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Spare-table installation handshake state. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
409
410 int
411 rf_autoconfig(device_t self)
412 {
413 RF_AutoConfig_t *ac_list;
414 RF_ConfigSet_t *config_sets;
415
416 if (!raidautoconfig || raidautoconfigdone == true)
417 return (0);
418
419 /* XXX This code can only be run once. */
420 raidautoconfigdone = true;
421
422 /* 1. locate all RAID components on the system */
423 aprint_debug("Searching for RAID components...\n");
424 ac_list = rf_find_raid_components();
425
426 /* 2. Sort them into their respective sets. */
427 config_sets = rf_create_auto_sets(ac_list);
428
429 /*
430 * 3. Evaluate each set and configure the valid ones.
431 * This gets done in rf_buildroothack().
432 */
433 rf_buildroothack(config_sets);
434
435 return 1;
436 }
437
/*
 * Walk the list of auto-detected configuration sets, configure the
 * eligible ones, and try to work out which configured set (if any)
 * holds the root file system so booted_device can point at it.
 * Consumes the config_sets list (each set is cleaned up as we go).
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int col;
	int num_root;		/* count of configured, rootable sets */
	char *devname;
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			/* XXX: How do we find the real root partition?
			   For now assume it is the wedge named after
			   partition 'a'. */
			char cname[sizeof(cset->ac->devname)];
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			booted_device = dkwedge_find_by_wname(cname);
		} else
			booted_device = rsc->sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/*
		 * Recount, keeping only sets that actually contain
		 * the component device we booted from.
		 */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			for (col = 0; col < r->numCol; col++) {
				devname = r->Disks[col].devname;
				/* strip the "/dev/" prefix before comparing */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
				    strlen(device_xname(booted_device))) != 0)
					continue;
				aprint_debug("raid%d includes boot device %s\n",
				    sc->sc_unit, devname);
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
541
542
543 int
544 raidsize(dev_t dev)
545 {
546 struct raid_softc *rs;
547 struct disklabel *lp;
548 int part, unit, omask, size;
549
550 unit = raidunit(dev);
551 if ((rs = raidget(unit)) == NULL)
552 return -1;
553 if ((rs->sc_flags & RAIDF_INITED) == 0)
554 return (-1);
555
556 part = DISKPART(dev);
557 omask = rs->sc_dkdev.dk_openmask & (1 << part);
558 lp = rs->sc_dkdev.dk_label;
559
560 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
561 return (-1);
562
563 if (lp->d_partitions[part].p_fstype != FS_SWAP)
564 size = -1;
565 else
566 size = lp->d_partitions[part].p_size *
567 (lp->d_secsize / DEV_BSIZE);
568
569 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
570 return (-1);
571
572 return (size);
573
574 }
575
576 int
577 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
578 {
579 int unit = raidunit(dev);
580 struct raid_softc *rs;
581 const struct bdevsw *bdev;
582 struct disklabel *lp;
583 RF_Raid_t *raidPtr;
584 daddr_t offset;
585 int part, c, sparecol, j, scol, dumpto;
586 int error = 0;
587
588 if ((rs = raidget(unit)) == NULL)
589 return ENXIO;
590
591 raidPtr = &rs->sc_r;
592
593 if ((rs->sc_flags & RAIDF_INITED) == 0)
594 return ENXIO;
595
596 /* we only support dumping to RAID 1 sets */
597 if (raidPtr->Layout.numDataCol != 1 ||
598 raidPtr->Layout.numParityCol != 1)
599 return EINVAL;
600
601
602 if ((error = raidlock(rs)) != 0)
603 return error;
604
605 if (size % DEV_BSIZE != 0) {
606 error = EINVAL;
607 goto out;
608 }
609
610 if (blkno + size / DEV_BSIZE > rs->sc_size) {
611 printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
612 "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
613 size / DEV_BSIZE, rs->sc_size);
614 error = EINVAL;
615 goto out;
616 }
617
618 part = DISKPART(dev);
619 lp = rs->sc_dkdev.dk_label;
620 offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
621
622 /* figure out what device is alive.. */
623
624 /*
625 Look for a component to dump to. The preference for the
626 component to dump to is as follows:
627 1) the master
628 2) a used_spare of the master
629 3) the slave
630 4) a used_spare of the slave
631 */
632
633 dumpto = -1;
634 for (c = 0; c < raidPtr->numCol; c++) {
635 if (raidPtr->Disks[c].status == rf_ds_optimal) {
636 /* this might be the one */
637 dumpto = c;
638 break;
639 }
640 }
641
642 /*
643 At this point we have possibly selected a live master or a
644 live slave. We now check to see if there is a spared
645 master (or a spared slave), if we didn't find a live master
646 or a live slave.
647 */
648
649 for (c = 0; c < raidPtr->numSpare; c++) {
650 sparecol = raidPtr->numCol + c;
651 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
652 /* How about this one? */
653 scol = -1;
654 for(j=0;j<raidPtr->numCol;j++) {
655 if (raidPtr->Disks[j].spareCol == sparecol) {
656 scol = j;
657 break;
658 }
659 }
660 if (scol == 0) {
661 /*
662 We must have found a spared master!
663 We'll take that over anything else
664 found so far. (We couldn't have
665 found a real master before, since
666 this is a used spare, and it's
667 saying that it's replacing the
668 master.) On reboot (with
669 autoconfiguration turned on)
670 sparecol will become the 1st
671 component (component0) of this set.
672 */
673 dumpto = sparecol;
674 break;
675 } else if (scol != -1) {
676 /*
677 Must be a spared slave. We'll dump
678 to that if we havn't found anything
679 else so far.
680 */
681 if (dumpto == -1)
682 dumpto = sparecol;
683 }
684 }
685 }
686
687 if (dumpto == -1) {
688 /* we couldn't find any live components to dump to!?!?
689 */
690 error = EINVAL;
691 goto out;
692 }
693
694 bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
695
696 /*
697 Note that blkno is relative to this particular partition.
698 By adding the offset of this partition in the RAID
699 set, and also adding RF_PROTECTED_SECTORS, we get a
700 value that is relative to the partition used for the
701 underlying component.
702 */
703
704 error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
705 blkno + offset, va, size);
706
707 out:
708 raidunlock(rs);
709
710 return error;
711 }
/* ARGSUSED */
/*
 * Open the raid device.  (Re)reads the disklabel on the first open of
 * an initialized set, validates the requested partition, and records
 * the open in the per-format open masks so the set cannot be
 * unconfigured while in use.  The first open also marks all
 * components dirty via rf_markalldirty().
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, pmask;
	int error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* A set that is being torn down cannot be opened. */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of an initialized set: load the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);

}
/* ARGSUSED */
/*
 * Close the raid device: clear the open-mask bit for this partition.
 * When the last partition of an initialized set closes, push the
 * final "clean" state out to the component labels.
 */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	int error = 0;
	int part;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(&rs->sc_r,
		    RF_FINAL_COMPONENT_UPDATE);

		/* If the kernel is shutting down, it will detach
		 * this RAID set soon enough.
		 */
	}

	raidunlock(rs);
	return (0);

}
845
/*
 * Block-device strategy entry point: validate the request,
 * bounds-check it against either the raw device size or the
 * disklabel, then hand it to the RAIDframe I/O thread via the
 * per-unit buffer queue.  On error, bp->b_error is set and the
 * buffer is completed immediately.
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int wlabel;
	struct raid_softc *rs;

	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	/* Zero-length transfers complete trivially with no error. */
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer. If there's an
	 * error, the bounds check will flag that for us.
	 */

	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size; /* device size in DEV_BSIZE unit */

		/* Convert totalSectors to DEV_BSIZE units, shifting
		   whichever way the sector-size difference goes. */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* schedule the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
916 /* ARGSUSED */
917 int
918 raidread(dev_t dev, struct uio *uio, int flags)
919 {
920 int unit = raidunit(dev);
921 struct raid_softc *rs;
922
923 if ((rs = raidget(unit)) == NULL)
924 return ENXIO;
925
926 if ((rs->sc_flags & RAIDF_INITED) == 0)
927 return (ENXIO);
928
929 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
930
931 }
932 /* ARGSUSED */
933 int
934 raidwrite(dev_t dev, struct uio *uio, int flags)
935 {
936 int unit = raidunit(dev);
937 struct raid_softc *rs;
938
939 if ((rs = raidget(unit)) == NULL)
940 return ENXIO;
941
942 if ((rs->sc_flags & RAIDF_INITED) == 0)
943 return (ENXIO);
944
945 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
946
947 }
948
949 static int
950 raid_detach_unlocked(struct raid_softc *rs)
951 {
952 int error;
953 RF_Raid_t *raidPtr;
954
955 raidPtr = &rs->sc_r;
956
957 /*
958 * If somebody has a partition mounted, we shouldn't
959 * shutdown.
960 */
961 if (rs->sc_dkdev.dk_openmask != 0)
962 return EBUSY;
963
964 if ((rs->sc_flags & RAIDF_INITED) == 0)
965 ; /* not initialized: nothing to do */
966 else if ((error = rf_Shutdown(raidPtr)) != 0)
967 return error;
968 else
969 rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
970
971 /* Detach the disk. */
972 dkwedge_delall(&rs->sc_dkdev);
973 disk_detach(&rs->sc_dkdev);
974 disk_destroy(&rs->sc_dkdev);
975
976 aprint_normal_dev(rs->sc_dev, "detached\n");
977
978 return 0;
979 }
980
981 int
982 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
983 {
984 int unit = raidunit(dev);
985 int error = 0;
986 int part, pmask, s;
987 cfdata_t cf;
988 struct raid_softc *rs;
989 RF_Config_t *k_cfg, *u_cfg;
990 RF_Raid_t *raidPtr;
991 RF_RaidDisk_t *diskPtr;
992 RF_AccTotals_t *totals;
993 RF_DeviceConfig_t *d_cfg, **ucfgp;
994 u_char *specific_buf;
995 int retcode = 0;
996 int column;
997 /* int raidid; */
998 struct rf_recon_req *rrcopy, *rr;
999 RF_ComponentLabel_t *clabel;
1000 RF_ComponentLabel_t *ci_label;
1001 RF_ComponentLabel_t **clabel_ptr;
1002 RF_SingleComponent_t *sparePtr,*componentPtr;
1003 RF_SingleComponent_t component;
1004 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1005 int i, j, d;
1006 #ifdef __HAVE_OLD_DISKLABEL
1007 struct disklabel newlabel;
1008 #endif
1009 struct dkwedge_info *dkw;
1010
1011 if ((rs = raidget(unit)) == NULL)
1012 return ENXIO;
1013 raidPtr = &rs->sc_r;
1014
1015 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1016 (int) DISKPART(dev), (int) unit, cmd));
1017
1018 /* Must be open for writes for these commands... */
1019 switch (cmd) {
1020 #ifdef DIOCGSECTORSIZE
1021 case DIOCGSECTORSIZE:
1022 *(u_int *)data = raidPtr->bytesPerSector;
1023 return 0;
1024 case DIOCGMEDIASIZE:
1025 *(off_t *)data =
1026 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1027 return 0;
1028 #endif
1029 case DIOCSDINFO:
1030 case DIOCWDINFO:
1031 #ifdef __HAVE_OLD_DISKLABEL
1032 case ODIOCWDINFO:
1033 case ODIOCSDINFO:
1034 #endif
1035 case DIOCWLABEL:
1036 case DIOCAWEDGE:
1037 case DIOCDWEDGE:
1038 case DIOCSSTRATEGY:
1039 if ((flag & FWRITE) == 0)
1040 return (EBADF);
1041 }
1042
1043 /* Must be initialized for these... */
1044 switch (cmd) {
1045 case DIOCGDINFO:
1046 case DIOCSDINFO:
1047 case DIOCWDINFO:
1048 #ifdef __HAVE_OLD_DISKLABEL
1049 case ODIOCGDINFO:
1050 case ODIOCWDINFO:
1051 case ODIOCSDINFO:
1052 case ODIOCGDEFLABEL:
1053 #endif
1054 case DIOCGPART:
1055 case DIOCWLABEL:
1056 case DIOCGDEFLABEL:
1057 case DIOCAWEDGE:
1058 case DIOCDWEDGE:
1059 case DIOCLWEDGES:
1060 case DIOCCACHESYNC:
1061 case RAIDFRAME_SHUTDOWN:
1062 case RAIDFRAME_REWRITEPARITY:
1063 case RAIDFRAME_GET_INFO:
1064 case RAIDFRAME_RESET_ACCTOTALS:
1065 case RAIDFRAME_GET_ACCTOTALS:
1066 case RAIDFRAME_KEEP_ACCTOTALS:
1067 case RAIDFRAME_GET_SIZE:
1068 case RAIDFRAME_FAIL_DISK:
1069 case RAIDFRAME_COPYBACK:
1070 case RAIDFRAME_CHECK_RECON_STATUS:
1071 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1072 case RAIDFRAME_GET_COMPONENT_LABEL:
1073 case RAIDFRAME_SET_COMPONENT_LABEL:
1074 case RAIDFRAME_ADD_HOT_SPARE:
1075 case RAIDFRAME_REMOVE_HOT_SPARE:
1076 case RAIDFRAME_INIT_LABELS:
1077 case RAIDFRAME_REBUILD_IN_PLACE:
1078 case RAIDFRAME_CHECK_PARITY:
1079 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1080 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1081 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1082 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1083 case RAIDFRAME_SET_AUTOCONFIG:
1084 case RAIDFRAME_SET_ROOT:
1085 case RAIDFRAME_DELETE_COMPONENT:
1086 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1087 case RAIDFRAME_PARITYMAP_STATUS:
1088 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1089 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1090 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1091 case DIOCGSTRATEGY:
1092 case DIOCSSTRATEGY:
1093 if ((rs->sc_flags & RAIDF_INITED) == 0)
1094 return (ENXIO);
1095 }
1096
1097 switch (cmd) {
1098 #ifdef COMPAT_50
1099 case RAIDFRAME_GET_INFO50:
1100 return rf_get_info50(raidPtr, data);
1101
1102 case RAIDFRAME_CONFIGURE50:
1103 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1104 return retcode;
1105 goto config;
1106 #endif
1107 /* configure the system */
1108 case RAIDFRAME_CONFIGURE:
1109
1110 if (raidPtr->valid) {
1111 /* There is a valid RAID set running on this unit! */
1112 printf("raid%d: Device already configured!\n",unit);
1113 return(EINVAL);
1114 }
1115
1116 /* copy-in the configuration information */
1117 /* data points to a pointer to the configuration structure */
1118
1119 u_cfg = *((RF_Config_t **) data);
1120 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1121 if (k_cfg == NULL) {
1122 return (ENOMEM);
1123 }
1124 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1125 if (retcode) {
1126 RF_Free(k_cfg, sizeof(RF_Config_t));
1127 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1128 retcode));
1129 return (retcode);
1130 }
1131 goto config;
1132 config:
1133 /* allocate a buffer for the layout-specific data, and copy it
1134 * in */
1135 if (k_cfg->layoutSpecificSize) {
1136 if (k_cfg->layoutSpecificSize > 10000) {
1137 /* sanity check */
1138 RF_Free(k_cfg, sizeof(RF_Config_t));
1139 return (EINVAL);
1140 }
1141 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1142 (u_char *));
1143 if (specific_buf == NULL) {
1144 RF_Free(k_cfg, sizeof(RF_Config_t));
1145 return (ENOMEM);
1146 }
1147 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1148 k_cfg->layoutSpecificSize);
1149 if (retcode) {
1150 RF_Free(k_cfg, sizeof(RF_Config_t));
1151 RF_Free(specific_buf,
1152 k_cfg->layoutSpecificSize);
1153 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1154 retcode));
1155 return (retcode);
1156 }
1157 } else
1158 specific_buf = NULL;
1159 k_cfg->layoutSpecific = specific_buf;
1160
1161 /* should do some kind of sanity check on the configuration.
1162 * Store the sum of all the bytes in the last byte? */
1163
1164 /* configure the system */
1165
1166 /*
1167 * Clear the entire RAID descriptor, just to make sure
1168 * there is no stale data left in the case of a
1169 * reconfiguration
1170 */
1171 memset(raidPtr, 0, sizeof(*raidPtr));
1172 raidPtr->raidid = unit;
1173
1174 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1175
1176 if (retcode == 0) {
1177
1178 /* allow this many simultaneous IO's to
1179 this RAID device */
1180 raidPtr->openings = RAIDOUTSTANDING;
1181
1182 raidinit(rs);
1183 rf_markalldirty(raidPtr);
1184 }
1185 /* free the buffers. No return code here. */
1186 if (k_cfg->layoutSpecificSize) {
1187 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1188 }
1189 RF_Free(k_cfg, sizeof(RF_Config_t));
1190
1191 return (retcode);
1192
1193 /* shutdown the system */
1194 case RAIDFRAME_SHUTDOWN:
1195
1196 part = DISKPART(dev);
1197 pmask = (1 << part);
1198
1199 if ((error = raidlock(rs)) != 0)
1200 return (error);
1201
1202 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1203 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1204 (rs->sc_dkdev.dk_copenmask & pmask)))
1205 retcode = EBUSY;
1206 else {
1207 rs->sc_flags |= RAIDF_SHUTDOWN;
1208 rs->sc_dkdev.dk_copenmask &= ~pmask;
1209 rs->sc_dkdev.dk_bopenmask &= ~pmask;
1210 rs->sc_dkdev.dk_openmask &= ~pmask;
1211 retcode = 0;
1212 }
1213
1214 raidunlock(rs);
1215
1216 if (retcode != 0)
1217 return retcode;
1218
1219 /* free the pseudo device attach bits */
1220
1221 cf = device_cfdata(rs->sc_dev);
1222 if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
1223 free(cf, M_RAIDFRAME);
1224
1225 return (retcode);
1226 case RAIDFRAME_GET_COMPONENT_LABEL:
1227 clabel_ptr = (RF_ComponentLabel_t **) data;
1228 /* need to read the component label for the disk indicated
1229 by row,column in clabel */
1230
1231 /*
1232 * Perhaps there should be an option to skip the in-core
1233 * copy and hit the disk, as with disklabel(8).
1234 */
1235 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1236
1237 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1238
1239 if (retcode) {
1240 RF_Free(clabel, sizeof(*clabel));
1241 return retcode;
1242 }
1243
1244 clabel->row = 0; /* Don't allow looking at anything else.*/
1245
1246 column = clabel->column;
1247
1248 if ((column < 0) || (column >= raidPtr->numCol +
1249 raidPtr->numSpare)) {
1250 RF_Free(clabel, sizeof(*clabel));
1251 return EINVAL;
1252 }
1253
1254 RF_Free(clabel, sizeof(*clabel));
1255
1256 clabel = raidget_component_label(raidPtr, column);
1257
1258 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1259
1260 #if 0
1261 case RAIDFRAME_SET_COMPONENT_LABEL:
1262 clabel = (RF_ComponentLabel_t *) data;
1263
1264 /* XXX check the label for valid stuff... */
1265 /* Note that some things *should not* get modified --
1266 the user should be re-initing the labels instead of
1267 trying to patch things.
1268 */
1269
1270 raidid = raidPtr->raidid;
1271 #ifdef DEBUG
1272 printf("raid%d: Got component label:\n", raidid);
1273 printf("raid%d: Version: %d\n", raidid, clabel->version);
1274 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1275 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1276 printf("raid%d: Column: %d\n", raidid, clabel->column);
1277 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1278 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1279 printf("raid%d: Status: %d\n", raidid, clabel->status);
1280 #endif
1281 clabel->row = 0;
1282 column = clabel->column;
1283
1284 if ((column < 0) || (column >= raidPtr->numCol)) {
1285 return(EINVAL);
1286 }
1287
1288 /* XXX this isn't allowed to do anything for now :-) */
1289
1290 /* XXX and before it is, we need to fill in the rest
1291 of the fields!?!?!?! */
1292 memcpy(raidget_component_label(raidPtr, column),
1293 clabel, sizeof(*clabel));
1294 raidflush_component_label(raidPtr, column);
1295 return (0);
1296 #endif
1297
1298 case RAIDFRAME_INIT_LABELS:
1299 clabel = (RF_ComponentLabel_t *) data;
1300 /*
1301 we only want the serial number from
1302 the above. We get all the rest of the information
1303 from the config that was used to create this RAID
1304 set.
1305 */
1306
1307 raidPtr->serial_number = clabel->serial_number;
1308
1309 for(column=0;column<raidPtr->numCol;column++) {
1310 diskPtr = &raidPtr->Disks[column];
1311 if (!RF_DEAD_DISK(diskPtr->status)) {
1312 ci_label = raidget_component_label(raidPtr,
1313 column);
1314 /* Zeroing this is important. */
1315 memset(ci_label, 0, sizeof(*ci_label));
1316 raid_init_component_label(raidPtr, ci_label);
1317 ci_label->serial_number =
1318 raidPtr->serial_number;
1319 ci_label->row = 0; /* we dont' pretend to support more */
1320 rf_component_label_set_partitionsize(ci_label,
1321 diskPtr->partitionSize);
1322 ci_label->column = column;
1323 raidflush_component_label(raidPtr, column);
1324 }
1325 /* XXXjld what about the spares? */
1326 }
1327
1328 return (retcode);
1329 case RAIDFRAME_SET_AUTOCONFIG:
1330 d = rf_set_autoconfig(raidPtr, *(int *) data);
1331 printf("raid%d: New autoconfig value is: %d\n",
1332 raidPtr->raidid, d);
1333 *(int *) data = d;
1334 return (retcode);
1335
1336 case RAIDFRAME_SET_ROOT:
1337 d = rf_set_rootpartition(raidPtr, *(int *) data);
1338 printf("raid%d: New rootpartition value is: %d\n",
1339 raidPtr->raidid, d);
1340 *(int *) data = d;
1341 return (retcode);
1342
1343 /* initialize all parity */
1344 case RAIDFRAME_REWRITEPARITY:
1345
1346 if (raidPtr->Layout.map->faultsTolerated == 0) {
1347 /* Parity for RAID 0 is trivially correct */
1348 raidPtr->parity_good = RF_RAID_CLEAN;
1349 return(0);
1350 }
1351
1352 if (raidPtr->parity_rewrite_in_progress == 1) {
1353 /* Re-write is already in progress! */
1354 return(EINVAL);
1355 }
1356
1357 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1358 rf_RewriteParityThread,
1359 raidPtr,"raid_parity");
1360 return (retcode);
1361
1362
1363 case RAIDFRAME_ADD_HOT_SPARE:
1364 sparePtr = (RF_SingleComponent_t *) data;
1365 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1366 retcode = rf_add_hot_spare(raidPtr, &component);
1367 return(retcode);
1368
1369 case RAIDFRAME_REMOVE_HOT_SPARE:
1370 return(retcode);
1371
1372 case RAIDFRAME_DELETE_COMPONENT:
1373 componentPtr = (RF_SingleComponent_t *)data;
1374 memcpy( &component, componentPtr,
1375 sizeof(RF_SingleComponent_t));
1376 retcode = rf_delete_component(raidPtr, &component);
1377 return(retcode);
1378
1379 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1380 componentPtr = (RF_SingleComponent_t *)data;
1381 memcpy( &component, componentPtr,
1382 sizeof(RF_SingleComponent_t));
1383 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1384 return(retcode);
1385
1386 case RAIDFRAME_REBUILD_IN_PLACE:
1387
1388 if (raidPtr->Layout.map->faultsTolerated == 0) {
1389 /* Can't do this on a RAID 0!! */
1390 return(EINVAL);
1391 }
1392
1393 if (raidPtr->recon_in_progress == 1) {
1394 /* a reconstruct is already in progress! */
1395 return(EINVAL);
1396 }
1397
1398 componentPtr = (RF_SingleComponent_t *) data;
1399 memcpy( &component, componentPtr,
1400 sizeof(RF_SingleComponent_t));
1401 component.row = 0; /* we don't support any more */
1402 column = component.column;
1403
1404 if ((column < 0) || (column >= raidPtr->numCol)) {
1405 return(EINVAL);
1406 }
1407
1408 rf_lock_mutex2(raidPtr->mutex);
1409 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1410 (raidPtr->numFailures > 0)) {
1411 /* XXX 0 above shouldn't be constant!!! */
1412 /* some component other than this has failed.
1413 Let's not make things worse than they already
1414 are... */
1415 printf("raid%d: Unable to reconstruct to disk at:\n",
1416 raidPtr->raidid);
1417 printf("raid%d: Col: %d Too many failures.\n",
1418 raidPtr->raidid, column);
1419 rf_unlock_mutex2(raidPtr->mutex);
1420 return (EINVAL);
1421 }
1422 if (raidPtr->Disks[column].status ==
1423 rf_ds_reconstructing) {
1424 printf("raid%d: Unable to reconstruct to disk at:\n",
1425 raidPtr->raidid);
1426 printf("raid%d: Col: %d Reconstruction already occurring!\n", raidPtr->raidid, column);
1427
1428 rf_unlock_mutex2(raidPtr->mutex);
1429 return (EINVAL);
1430 }
1431 if (raidPtr->Disks[column].status == rf_ds_spared) {
1432 rf_unlock_mutex2(raidPtr->mutex);
1433 return (EINVAL);
1434 }
1435 rf_unlock_mutex2(raidPtr->mutex);
1436
1437 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1438 if (rrcopy == NULL)
1439 return(ENOMEM);
1440
1441 rrcopy->raidPtr = (void *) raidPtr;
1442 rrcopy->col = column;
1443
1444 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1445 rf_ReconstructInPlaceThread,
1446 rrcopy,"raid_reconip");
1447 return(retcode);
1448
1449 case RAIDFRAME_GET_INFO:
1450 if (!raidPtr->valid)
1451 return (ENODEV);
1452 ucfgp = (RF_DeviceConfig_t **) data;
1453 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1454 (RF_DeviceConfig_t *));
1455 if (d_cfg == NULL)
1456 return (ENOMEM);
1457 d_cfg->rows = 1; /* there is only 1 row now */
1458 d_cfg->cols = raidPtr->numCol;
1459 d_cfg->ndevs = raidPtr->numCol;
1460 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1461 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1462 return (ENOMEM);
1463 }
1464 d_cfg->nspares = raidPtr->numSpare;
1465 if (d_cfg->nspares >= RF_MAX_DISKS) {
1466 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1467 return (ENOMEM);
1468 }
1469 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1470 d = 0;
1471 for (j = 0; j < d_cfg->cols; j++) {
1472 d_cfg->devs[d] = raidPtr->Disks[j];
1473 d++;
1474 }
1475 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1476 d_cfg->spares[i] = raidPtr->Disks[j];
1477 }
1478 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1479 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1480
1481 return (retcode);
1482
1483 case RAIDFRAME_CHECK_PARITY:
1484 *(int *) data = raidPtr->parity_good;
1485 return (0);
1486
1487 case RAIDFRAME_PARITYMAP_STATUS:
1488 if (rf_paritymap_ineligible(raidPtr))
1489 return EINVAL;
1490 rf_paritymap_status(raidPtr->parity_map,
1491 (struct rf_pmstat *)data);
1492 return 0;
1493
1494 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1495 if (rf_paritymap_ineligible(raidPtr))
1496 return EINVAL;
1497 if (raidPtr->parity_map == NULL)
1498 return ENOENT; /* ??? */
1499 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1500 (struct rf_pmparams *)data, 1))
1501 return EINVAL;
1502 return 0;
1503
1504 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1505 if (rf_paritymap_ineligible(raidPtr))
1506 return EINVAL;
1507 *(int *) data = rf_paritymap_get_disable(raidPtr);
1508 return 0;
1509
1510 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1511 if (rf_paritymap_ineligible(raidPtr))
1512 return EINVAL;
1513 rf_paritymap_set_disable(raidPtr, *(int *)data);
1514 /* XXX should errors be passed up? */
1515 return 0;
1516
1517 case RAIDFRAME_RESET_ACCTOTALS:
1518 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1519 return (0);
1520
1521 case RAIDFRAME_GET_ACCTOTALS:
1522 totals = (RF_AccTotals_t *) data;
1523 *totals = raidPtr->acc_totals;
1524 return (0);
1525
1526 case RAIDFRAME_KEEP_ACCTOTALS:
1527 raidPtr->keep_acc_totals = *(int *)data;
1528 return (0);
1529
1530 case RAIDFRAME_GET_SIZE:
1531 *(int *) data = raidPtr->totalSectors;
1532 return (0);
1533
1534 /* fail a disk & optionally start reconstruction */
1535 case RAIDFRAME_FAIL_DISK:
1536
1537 if (raidPtr->Layout.map->faultsTolerated == 0) {
1538 /* Can't do this on a RAID 0!! */
1539 return(EINVAL);
1540 }
1541
1542 rr = (struct rf_recon_req *) data;
1543 rr->row = 0;
1544 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1545 return (EINVAL);
1546
1547
1548 rf_lock_mutex2(raidPtr->mutex);
1549 if (raidPtr->status == rf_rs_reconstructing) {
1550 /* you can't fail a disk while we're reconstructing! */
1551 /* XXX wrong for RAID6 */
1552 rf_unlock_mutex2(raidPtr->mutex);
1553 return (EINVAL);
1554 }
1555 if ((raidPtr->Disks[rr->col].status ==
1556 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1557 /* some other component has failed. Let's not make
1558 things worse. XXX wrong for RAID6 */
1559 rf_unlock_mutex2(raidPtr->mutex);
1560 return (EINVAL);
1561 }
1562 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1563 /* Can't fail a spared disk! */
1564 rf_unlock_mutex2(raidPtr->mutex);
1565 return (EINVAL);
1566 }
1567 rf_unlock_mutex2(raidPtr->mutex);
1568
1569 /* make a copy of the recon request so that we don't rely on
1570 * the user's buffer */
1571 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1572 if (rrcopy == NULL)
1573 return(ENOMEM);
1574 memcpy(rrcopy, rr, sizeof(*rr));
1575 rrcopy->raidPtr = (void *) raidPtr;
1576
1577 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1578 rf_ReconThread,
1579 rrcopy,"raid_recon");
1580 return (0);
1581
1582 /* invoke a copyback operation after recon on whatever disk
1583 * needs it, if any */
1584 case RAIDFRAME_COPYBACK:
1585
1586 if (raidPtr->Layout.map->faultsTolerated == 0) {
1587 /* This makes no sense on a RAID 0!! */
1588 return(EINVAL);
1589 }
1590
1591 if (raidPtr->copyback_in_progress == 1) {
1592 /* Copyback is already in progress! */
1593 return(EINVAL);
1594 }
1595
1596 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1597 rf_CopybackThread,
1598 raidPtr,"raid_copyback");
1599 return (retcode);
1600
1601 /* return the percentage completion of reconstruction */
1602 case RAIDFRAME_CHECK_RECON_STATUS:
1603 if (raidPtr->Layout.map->faultsTolerated == 0) {
1604 /* This makes no sense on a RAID 0, so tell the
1605 user it's done. */
1606 *(int *) data = 100;
1607 return(0);
1608 }
1609 if (raidPtr->status != rf_rs_reconstructing)
1610 *(int *) data = 100;
1611 else {
1612 if (raidPtr->reconControl->numRUsTotal > 0) {
1613 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1614 } else {
1615 *(int *) data = 0;
1616 }
1617 }
1618 return (0);
1619 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1620 progressInfoPtr = (RF_ProgressInfo_t **) data;
1621 if (raidPtr->status != rf_rs_reconstructing) {
1622 progressInfo.remaining = 0;
1623 progressInfo.completed = 100;
1624 progressInfo.total = 100;
1625 } else {
1626 progressInfo.total =
1627 raidPtr->reconControl->numRUsTotal;
1628 progressInfo.completed =
1629 raidPtr->reconControl->numRUsComplete;
1630 progressInfo.remaining = progressInfo.total -
1631 progressInfo.completed;
1632 }
1633 retcode = copyout(&progressInfo, *progressInfoPtr,
1634 sizeof(RF_ProgressInfo_t));
1635 return (retcode);
1636
1637 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1638 if (raidPtr->Layout.map->faultsTolerated == 0) {
1639 /* This makes no sense on a RAID 0, so tell the
1640 user it's done. */
1641 *(int *) data = 100;
1642 return(0);
1643 }
1644 if (raidPtr->parity_rewrite_in_progress == 1) {
1645 *(int *) data = 100 *
1646 raidPtr->parity_rewrite_stripes_done /
1647 raidPtr->Layout.numStripe;
1648 } else {
1649 *(int *) data = 100;
1650 }
1651 return (0);
1652
1653 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1654 progressInfoPtr = (RF_ProgressInfo_t **) data;
1655 if (raidPtr->parity_rewrite_in_progress == 1) {
1656 progressInfo.total = raidPtr->Layout.numStripe;
1657 progressInfo.completed =
1658 raidPtr->parity_rewrite_stripes_done;
1659 progressInfo.remaining = progressInfo.total -
1660 progressInfo.completed;
1661 } else {
1662 progressInfo.remaining = 0;
1663 progressInfo.completed = 100;
1664 progressInfo.total = 100;
1665 }
1666 retcode = copyout(&progressInfo, *progressInfoPtr,
1667 sizeof(RF_ProgressInfo_t));
1668 return (retcode);
1669
1670 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1671 if (raidPtr->Layout.map->faultsTolerated == 0) {
1672 /* This makes no sense on a RAID 0 */
1673 *(int *) data = 100;
1674 return(0);
1675 }
1676 if (raidPtr->copyback_in_progress == 1) {
1677 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1678 raidPtr->Layout.numStripe;
1679 } else {
1680 *(int *) data = 100;
1681 }
1682 return (0);
1683
1684 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1685 progressInfoPtr = (RF_ProgressInfo_t **) data;
1686 if (raidPtr->copyback_in_progress == 1) {
1687 progressInfo.total = raidPtr->Layout.numStripe;
1688 progressInfo.completed =
1689 raidPtr->copyback_stripes_done;
1690 progressInfo.remaining = progressInfo.total -
1691 progressInfo.completed;
1692 } else {
1693 progressInfo.remaining = 0;
1694 progressInfo.completed = 100;
1695 progressInfo.total = 100;
1696 }
1697 retcode = copyout(&progressInfo, *progressInfoPtr,
1698 sizeof(RF_ProgressInfo_t));
1699 return (retcode);
1700
1701 /* the sparetable daemon calls this to wait for the kernel to
1702 * need a spare table. this ioctl does not return until a
1703 * spare table is needed. XXX -- calling mpsleep here in the
1704 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1705 * -- I should either compute the spare table in the kernel,
1706 * or have a different -- XXX XXX -- interface (a different
1707 * character device) for delivering the table -- XXX */
1708 #if 0
1709 case RAIDFRAME_SPARET_WAIT:
1710 rf_lock_mutex2(rf_sparet_wait_mutex);
1711 while (!rf_sparet_wait_queue)
1712 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1713 waitreq = rf_sparet_wait_queue;
1714 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1715 rf_unlock_mutex2(rf_sparet_wait_mutex);
1716
1717 /* structure assignment */
1718 *((RF_SparetWait_t *) data) = *waitreq;
1719
1720 RF_Free(waitreq, sizeof(*waitreq));
1721 return (0);
1722
	/* wakes up a process waiting on SPARET_WAIT and puts an error
	 * code in it that will cause the daemon to exit */
1725 case RAIDFRAME_ABORT_SPARET_WAIT:
1726 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1727 waitreq->fcol = -1;
1728 rf_lock_mutex2(rf_sparet_wait_mutex);
1729 waitreq->next = rf_sparet_wait_queue;
1730 rf_sparet_wait_queue = waitreq;
1731 rf_broadcast_conf2(rf_sparet_wait_cv);
1732 rf_unlock_mutex2(rf_sparet_wait_mutex);
1733 return (0);
1734
1735 /* used by the spare table daemon to deliver a spare table
1736 * into the kernel */
1737 case RAIDFRAME_SEND_SPARET:
1738
1739 /* install the spare table */
1740 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1741
1742 /* respond to the requestor. the return status of the spare
1743 * table installation is passed in the "fcol" field */
1744 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1745 waitreq->fcol = retcode;
1746 rf_lock_mutex2(rf_sparet_wait_mutex);
1747 waitreq->next = rf_sparet_resp_queue;
1748 rf_sparet_resp_queue = waitreq;
1749 rf_broadcast_cond2(rf_sparet_resp_cv);
1750 rf_unlock_mutex2(rf_sparet_wait_mutex);
1751
1752 return (retcode);
1753 #endif
1754
1755 default:
1756 break; /* fall through to the os-specific code below */
1757
1758 }
1759
1760 if (!raidPtr->valid)
1761 return (EINVAL);
1762
1763 /*
1764 * Add support for "regular" device ioctls here.
1765 */
1766
1767 error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
1768 if (error != EPASSTHROUGH)
1769 return (error);
1770
1771 switch (cmd) {
1772 case DIOCGDINFO:
1773 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1774 break;
1775 #ifdef __HAVE_OLD_DISKLABEL
1776 case ODIOCGDINFO:
1777 newlabel = *(rs->sc_dkdev.dk_label);
1778 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1779 return ENOTTY;
1780 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1781 break;
1782 #endif
1783
1784 case DIOCGPART:
1785 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1786 ((struct partinfo *) data)->part =
1787 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1788 break;
1789
1790 case DIOCWDINFO:
1791 case DIOCSDINFO:
1792 #ifdef __HAVE_OLD_DISKLABEL
1793 case ODIOCWDINFO:
1794 case ODIOCSDINFO:
1795 #endif
1796 {
1797 struct disklabel *lp;
1798 #ifdef __HAVE_OLD_DISKLABEL
1799 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1800 memset(&newlabel, 0, sizeof newlabel);
1801 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1802 lp = &newlabel;
1803 } else
1804 #endif
1805 lp = (struct disklabel *)data;
1806
1807 if ((error = raidlock(rs)) != 0)
1808 return (error);
1809
1810 rs->sc_flags |= RAIDF_LABELLING;
1811
1812 error = setdisklabel(rs->sc_dkdev.dk_label,
1813 lp, 0, rs->sc_dkdev.dk_cpulabel);
1814 if (error == 0) {
1815 if (cmd == DIOCWDINFO
1816 #ifdef __HAVE_OLD_DISKLABEL
1817 || cmd == ODIOCWDINFO
1818 #endif
1819 )
1820 error = writedisklabel(RAIDLABELDEV(dev),
1821 raidstrategy, rs->sc_dkdev.dk_label,
1822 rs->sc_dkdev.dk_cpulabel);
1823 }
1824 rs->sc_flags &= ~RAIDF_LABELLING;
1825
1826 raidunlock(rs);
1827
1828 if (error)
1829 return (error);
1830 break;
1831 }
1832
1833 case DIOCWLABEL:
1834 if (*(int *) data != 0)
1835 rs->sc_flags |= RAIDF_WLABEL;
1836 else
1837 rs->sc_flags &= ~RAIDF_WLABEL;
1838 break;
1839
1840 case DIOCGDEFLABEL:
1841 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1842 break;
1843
1844 #ifdef __HAVE_OLD_DISKLABEL
1845 case ODIOCGDEFLABEL:
1846 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1847 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1848 return ENOTTY;
1849 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1850 break;
1851 #endif
1852
1853 case DIOCAWEDGE:
1854 case DIOCDWEDGE:
1855 dkw = (void *)data;
1856
1857 /* If the ioctl happens here, the parent is us. */
1858 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1859 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1860
1861 case DIOCLWEDGES:
1862 return dkwedge_list(&rs->sc_dkdev,
1863 (struct dkwedge_list *)data, l);
1864 case DIOCCACHESYNC:
1865 return rf_sync_component_caches(raidPtr);
1866
1867 case DIOCGSTRATEGY:
1868 {
1869 struct disk_strategy *dks = (void *)data;
1870
1871 s = splbio();
1872 strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
1873 sizeof(dks->dks_name));
1874 splx(s);
1875 dks->dks_paramlen = 0;
1876
1877 return 0;
1878 }
1879
1880 case DIOCSSTRATEGY:
1881 {
1882 struct disk_strategy *dks = (void *)data;
1883 struct bufq_state *new;
1884 struct bufq_state *old;
1885
1886 if (dks->dks_param != NULL) {
1887 return EINVAL;
1888 }
1889 dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
1890 error = bufq_alloc(&new, dks->dks_name,
1891 BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
1892 if (error) {
1893 return error;
1894 }
1895 s = splbio();
1896 old = rs->buf_queue;
1897 bufq_move(new, old);
1898 rs->buf_queue = new;
1899 splx(s);
1900 bufq_free(old);
1901
1902 return 0;
1903 }
1904
1905 default:
1906 retcode = ENOTTY;
1907 }
1908 return (retcode);
1909
1910 }
1911
1912
1913 /* raidinit -- complete the rest of the initialization for the
1914 RAIDframe device. */
1915
1916
1917 static void
1918 raidinit(struct raid_softc *rs)
1919 {
1920 cfdata_t cf;
1921 int unit;
1922 RF_Raid_t *raidPtr = &rs->sc_r;
1923
1924 unit = raidPtr->raidid;
1925
1926
1927 /* XXX should check return code first... */
1928 rs->sc_flags |= RAIDF_INITED;
1929
1930 /* XXX doesn't check bounds. */
1931 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1932
1933 /* attach the pseudo device */
1934 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1935 cf->cf_name = raid_cd.cd_name;
1936 cf->cf_atname = raid_cd.cd_name;
1937 cf->cf_unit = unit;
1938 cf->cf_fstate = FSTATE_STAR;
1939
1940 rs->sc_dev = config_attach_pseudo(cf);
1941
1942 if (rs->sc_dev == NULL) {
1943 printf("raid%d: config_attach_pseudo failed\n",
1944 raidPtr->raidid);
1945 rs->sc_flags &= ~RAIDF_INITED;
1946 free(cf, M_RAIDFRAME);
1947 return;
1948 }
1949
1950 /* disk_attach actually creates space for the CPU disklabel, among
1951 * other things, so it's critical to call this *BEFORE* we try putzing
1952 * with disklabels. */
1953
1954 disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1955 disk_attach(&rs->sc_dkdev);
1956 disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
1957
1958 /* XXX There may be a weird interaction here between this, and
1959 * protectedSectors, as used in RAIDframe. */
1960
1961 rs->sc_size = raidPtr->totalSectors;
1962
1963 dkwedge_discover(&rs->sc_dkdev);
1964
1965 rf_set_properties(rs, raidPtr);
1966
1967 }
1968 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1969 /* wake up the daemon & tell it to get us a spare table
1970 * XXX
1971 * the entries in the queues should be tagged with the raidPtr
1972 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
1974 * XXX
1975 *
1976 * XXX This code is not currently used. GO
1977 */
1978 int
1979 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1980 {
1981 int retcode;
1982
1983 rf_lock_mutex2(rf_sparet_wait_mutex);
1984 req->next = rf_sparet_wait_queue;
1985 rf_sparet_wait_queue = req;
1986 rf_broadcast_cond2(rf_sparet_wait_cv);
1987
1988 /* mpsleep unlocks the mutex */
1989 while (!rf_sparet_resp_queue) {
1990 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
1991 }
1992 req = rf_sparet_resp_queue;
1993 rf_sparet_resp_queue = req->next;
1994 rf_unlock_mutex2(rf_sparet_wait_mutex);
1995
1996 retcode = req->fcol;
1997 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1998 * alloc'd */
1999 return (retcode);
2000 }
2001 #endif
2002
2003 /* a wrapper around rf_DoAccess that extracts appropriate info from the
2004 * bp & passes it down.
2005 * any calls originating in the kernel must use non-blocking I/O
2006 * do some extra sanity checking to return "appropriate" error values for
2007 * certain conditions (to make some standard utilities work)
2008 *
2009 * Formerly known as: rf_DoAccessKernel
2010 */
/*
 * raidstart: drain the unit's buffer queue, handing each request to
 * RAIDframe via rf_DoAccess(), until we run out of openings or bufs.
 * Takes and releases raidPtr->mutex itself around the accounting.
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Component-label updates are done without the mutex held. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb accounts for a trailing partial sector in the request. */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* NOTE(review): the "1 ||" forces this branch on
		 * unconditionally; db1_printf is a debug macro, so this looks
		 * like leftover debugging -- confirm before removing. */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject requests that run past the end of the array; the
		 * "sum < x" comparisons catch arithmetic wraparound. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Requests must be a whole number of sectors. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* Consume one opening for this I/O. */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* Synchronous failure from DoAccess: finish the buf
			 * with the error.  NOTE(review): the opening taken
			 * above is not visibly returned on this path --
			 * presumably the completion path handles it; verify. */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
2128
2129
2130
2131
2132 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2133
/*
 * rf_DispatchKernelIO: issue the low-level I/O (or NOP) described by
 * req on the given disk queue.  Per the comment above, the disk queue
 * is expected to be locked on entry; the lock is dropped and retaken
 * around bdev_strategy(), which can block.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* Fake an immediate completion so the queue is unblocked. */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf for the underlying component device. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		       op, queue->rf_cinfo->ci_dev,
		       req->sectorOffset, req->numSector,
		       req->buf, KernelWakeupFunc, (void *) req,
		       queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				    (long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			    (int) req->sectorOffset, (int) req->numSector,
			    (int) (req->numSector <<
			     queue->raidPtr->logBytesPerSector),
			    (int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/*
 * Completion callback for component I/O issued from
 * rf_DispatchKernelIO() (installed as bp->b_iodone by InitBP()).
 *
 * Records the error status in the request, possibly marks the
 * component as failed, and hands the finished request to the raidio
 * thread via the raidPtr->iodone queue.  Runs in biodone() context.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* InitBP() stashed the request pointer in b_private. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/* iodone_lock protects both the failure accounting below and
	   the iodone queue/condvar. */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Charge the elapsed physical I/O time to the access trace. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken, i.e. only while the failure count
		   is still below what the layout can tolerate */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error. Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2275
2276
2277 /*
2278 * initialize a buf structure for doing an I/O in the kernel.
2279 */
2280 static void
2281 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2282 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2283 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2284 struct proc *b_proc)
2285 {
2286 /* bp->b_flags = B_PHYS | rw_flag; */
2287 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2288 bp->b_oflags = 0;
2289 bp->b_cflags = 0;
2290 bp->b_bcount = numSect << logBytesPerSector;
2291 bp->b_bufsize = bp->b_bcount;
2292 bp->b_error = 0;
2293 bp->b_dev = dev;
2294 bp->b_data = bf;
2295 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2296 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2297 if (bp->b_bcount == 0) {
2298 panic("bp->b_bcount is zero in InitBP!!");
2299 }
2300 bp->b_proc = b_proc;
2301 bp->b_iodone = cbFunc;
2302 bp->b_private = cbArg;
2303 }
2304
2305 static void
2306 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2307 struct disklabel *lp)
2308 {
2309 memset(lp, 0, sizeof(*lp));
2310
2311 /* fabricate a label... */
2312 lp->d_secperunit = raidPtr->totalSectors;
2313 lp->d_secsize = raidPtr->bytesPerSector;
2314 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2315 lp->d_ntracks = 4 * raidPtr->numCol;
2316 lp->d_ncylinders = raidPtr->totalSectors /
2317 (lp->d_nsectors * lp->d_ntracks);
2318 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2319
2320 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2321 lp->d_type = DTYPE_RAID;
2322 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2323 lp->d_rpm = 3600;
2324 lp->d_interleave = 1;
2325 lp->d_flags = 0;
2326
2327 lp->d_partitions[RAW_PART].p_offset = 0;
2328 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2329 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2330 lp->d_npartitions = RAW_PART + 1;
2331
2332 lp->d_magic = DISKMAGIC;
2333 lp->d_magic2 = DISKMAGIC;
2334 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2335
2336 }
/*
 * Read the disklabel from the raid device.  If one is not present, fake one
 * up: a default label is fabricated first, then readdisklabel() either
 * overwrites it with the on-disk label or we fall back to
 * raidmakedisklabel().  A found label is sanity-checked against the
 * actual size of the raid set (warnings only; the label is kept).
 */
static void
raidgetdisklabel(dev_t dev)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	const char *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	/* no softc for this unit -> nothing to do */
	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* start from a fabricated default label */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		/* warn about any partition extending past the end of the set */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				    "exceeds the size of raid (%" PRIu64 ")\n",
				    unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
2399 /*
2400 * Take care of things one might want to take care of in the event
2401 * that a disklabel isn't present.
2402 */
2403 static void
2404 raidmakedisklabel(struct raid_softc *rs)
2405 {
2406 struct disklabel *lp = rs->sc_dkdev.dk_label;
2407 db1_printf(("Making a label..\n"));
2408
2409 /*
2410 * For historical reasons, if there's no disklabel present
2411 * the raw partition must be marked FS_BSDFFS.
2412 */
2413
2414 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2415
2416 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2417
2418 lp->d_checksum = dkcksum(lp);
2419 }
2420 /*
2421 * Wait interruptibly for an exclusive lock.
2422 *
2423 * XXX
2424 * Several drivers do this; it should be abstracted and made MP-safe.
2425 * (Hmm... where have we seen this warning before :-> GO )
2426 */
2427 static int
2428 raidlock(struct raid_softc *rs)
2429 {
2430 int error;
2431
2432 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2433 rs->sc_flags |= RAIDF_WANTED;
2434 if ((error =
2435 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2436 return (error);
2437 }
2438 rs->sc_flags |= RAIDF_LOCKED;
2439 return (0);
2440 }
2441 /*
2442 * Unlock and wake up any waiters.
2443 */
2444 static void
2445 raidunlock(struct raid_softc *rs)
2446 {
2447
2448 rs->sc_flags &= ~RAIDF_LOCKED;
2449 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2450 rs->sc_flags &= ~RAIDF_WANTED;
2451 wakeup(rs);
2452 }
2453 }
2454
2455
2456 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2457 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2458 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2459
2460 static daddr_t
2461 rf_component_info_offset(void)
2462 {
2463
2464 return RF_COMPONENT_INFO_OFFSET;
2465 }
2466
2467 static daddr_t
2468 rf_component_info_size(unsigned secsize)
2469 {
2470 daddr_t info_size;
2471
2472 KASSERT(secsize);
2473 if (secsize > RF_COMPONENT_INFO_SIZE)
2474 info_size = secsize;
2475 else
2476 info_size = RF_COMPONENT_INFO_SIZE;
2477
2478 return info_size;
2479 }
2480
2481 static daddr_t
2482 rf_parity_map_offset(RF_Raid_t *raidPtr)
2483 {
2484 daddr_t map_offset;
2485
2486 KASSERT(raidPtr->bytesPerSector);
2487 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2488 map_offset = raidPtr->bytesPerSector;
2489 else
2490 map_offset = RF_COMPONENT_INFO_SIZE;
2491 map_offset += rf_component_info_offset();
2492
2493 return map_offset;
2494 }
2495
2496 static daddr_t
2497 rf_parity_map_size(RF_Raid_t *raidPtr)
2498 {
2499 daddr_t map_size;
2500
2501 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2502 map_size = raidPtr->bytesPerSector;
2503 else
2504 map_size = RF_PARITY_MAP_SIZE;
2505
2506 return map_size;
2507 }
2508
2509 int
2510 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2511 {
2512 RF_ComponentLabel_t *clabel;
2513
2514 clabel = raidget_component_label(raidPtr, col);
2515 clabel->clean = RF_RAID_CLEAN;
2516 raidflush_component_label(raidPtr, col);
2517 return(0);
2518 }
2519
2520
2521 int
2522 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2523 {
2524 RF_ComponentLabel_t *clabel;
2525
2526 clabel = raidget_component_label(raidPtr, col);
2527 clabel->clean = RF_RAID_DIRTY;
2528 raidflush_component_label(raidPtr, col);
2529 return(0);
2530 }
2531
2532 int
2533 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2534 {
2535 KASSERT(raidPtr->bytesPerSector);
2536 return raidread_component_label(raidPtr->bytesPerSector,
2537 raidPtr->Disks[col].dev,
2538 raidPtr->raid_cinfo[col].ci_vp,
2539 &raidPtr->raid_cinfo[col].ci_label);
2540 }
2541
2542 RF_ComponentLabel_t *
2543 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2544 {
2545 return &raidPtr->raid_cinfo[col].ci_label;
2546 }
2547
2548 int
2549 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2550 {
2551 RF_ComponentLabel_t *label;
2552
2553 label = &raidPtr->raid_cinfo[col].ci_label;
2554 label->mod_counter = raidPtr->mod_counter;
2555 #ifndef RF_NO_PARITY_MAP
2556 label->parity_map_modcount = label->mod_counter;
2557 #endif
2558 return raidwrite_component_label(raidPtr->bytesPerSector,
2559 raidPtr->Disks[col].dev,
2560 raidPtr->raid_cinfo[col].ci_vp, label);
2561 }
2562
2563
2564 static int
2565 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2566 RF_ComponentLabel_t *clabel)
2567 {
2568 return raidread_component_area(dev, b_vp, clabel,
2569 sizeof(RF_ComponentLabel_t),
2570 rf_component_info_offset(),
2571 rf_component_info_size(secsize));
2572 }
2573
2574 /* ARGSUSED */
2575 static int
2576 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2577 size_t msize, daddr_t offset, daddr_t dsize)
2578 {
2579 struct buf *bp;
2580 const struct bdevsw *bdev;
2581 int error;
2582
2583 /* XXX should probably ensure that we don't try to do this if
2584 someone has changed rf_protected_sectors. */
2585
2586 if (b_vp == NULL) {
2587 /* For whatever reason, this component is not valid.
2588 Don't try to read a component label from it. */
2589 return(EINVAL);
2590 }
2591
2592 /* get a block of the appropriate size... */
2593 bp = geteblk((int)dsize);
2594 bp->b_dev = dev;
2595
2596 /* get our ducks in a row for the read */
2597 bp->b_blkno = offset / DEV_BSIZE;
2598 bp->b_bcount = dsize;
2599 bp->b_flags |= B_READ;
2600 bp->b_resid = dsize;
2601
2602 bdev = bdevsw_lookup(bp->b_dev);
2603 if (bdev == NULL)
2604 return (ENXIO);
2605 (*bdev->d_strategy)(bp);
2606
2607 error = biowait(bp);
2608
2609 if (!error) {
2610 memcpy(data, bp->b_data, msize);
2611 }
2612
2613 brelse(bp, 0);
2614 return(error);
2615 }
2616
2617
2618 static int
2619 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2620 RF_ComponentLabel_t *clabel)
2621 {
2622 return raidwrite_component_area(dev, b_vp, clabel,
2623 sizeof(RF_ComponentLabel_t),
2624 rf_component_info_offset(),
2625 rf_component_info_size(secsize), 0);
2626 }
2627
2628 /* ARGSUSED */
2629 static int
2630 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2631 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2632 {
2633 struct buf *bp;
2634 const struct bdevsw *bdev;
2635 int error;
2636
2637 /* get a block of the appropriate size... */
2638 bp = geteblk((int)dsize);
2639 bp->b_dev = dev;
2640
2641 /* get our ducks in a row for the write */
2642 bp->b_blkno = offset / DEV_BSIZE;
2643 bp->b_bcount = dsize;
2644 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2645 bp->b_resid = dsize;
2646
2647 memset(bp->b_data, 0, dsize);
2648 memcpy(bp->b_data, data, msize);
2649
2650 bdev = bdevsw_lookup(bp->b_dev);
2651 if (bdev == NULL)
2652 return (ENXIO);
2653 (*bdev->d_strategy)(bp);
2654 if (asyncp)
2655 return 0;
2656 error = biowait(bp);
2657 brelse(bp, 0);
2658 if (error) {
2659 #if 1
2660 printf("Failed to write RAID component info!\n");
2661 #endif
2662 }
2663
2664 return(error);
2665 }
2666
2667 void
2668 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2669 {
2670 int c;
2671
2672 for (c = 0; c < raidPtr->numCol; c++) {
2673 /* Skip dead disks. */
2674 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2675 continue;
2676 /* XXXjld: what if an error occurs here? */
2677 raidwrite_component_area(raidPtr->Disks[c].dev,
2678 raidPtr->raid_cinfo[c].ci_vp, map,
2679 RF_PARITYMAP_NBYTE,
2680 rf_parity_map_offset(raidPtr),
2681 rf_parity_map_size(raidPtr), 0);
2682 }
2683 }
2684
2685 void
2686 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2687 {
2688 struct rf_paritymap_ondisk tmp;
2689 int c,first;
2690
2691 first=1;
2692 for (c = 0; c < raidPtr->numCol; c++) {
2693 /* Skip dead disks. */
2694 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2695 continue;
2696 raidread_component_area(raidPtr->Disks[c].dev,
2697 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2698 RF_PARITYMAP_NBYTE,
2699 rf_parity_map_offset(raidPtr),
2700 rf_parity_map_size(raidPtr));
2701 if (first) {
2702 memcpy(map, &tmp, sizeof(*map));
2703 first = 0;
2704 } else {
2705 rf_paritymap_merge(map, &tmp);
2706 }
2707 }
2708 }
2709
/*
 * Bump the set's modification counter and mark the component label of
 * every usable component (and every in-use spare) dirty on disk.
 * Called so that an unclean shutdown can be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare is standing in for;
			   NOTE(review): scol stays at its previous value
			   (initially -1) if no column matches -- presumably
			   a used spare always has a matching column. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2769
2770
/*
 * Refresh the component labels of all optimal components and in-use
 * spares: bump the mod counter, record current status and unit number,
 * and write the labels out.  When `final' is RF_FINAL_COMPONENT_UPDATE
 * and parity is known good, the labels are additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			/* only mark clean on the final update with good parity */
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare replaced;
			   NOTE(review): scol remains -1 if no column
			   matches -- presumably a used spare always has
			   a matching column. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2845
2846 void
2847 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2848 {
2849
2850 if (vp != NULL) {
2851 if (auto_configured == 1) {
2852 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2853 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2854 vput(vp);
2855
2856 } else {
2857 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2858 }
2859 }
2860 }
2861
2862
2863 void
2864 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2865 {
2866 int r,c;
2867 struct vnode *vp;
2868 int acd;
2869
2870
2871 /* We take this opportunity to close the vnodes like we should.. */
2872
2873 for (c = 0; c < raidPtr->numCol; c++) {
2874 vp = raidPtr->raid_cinfo[c].ci_vp;
2875 acd = raidPtr->Disks[c].auto_configured;
2876 rf_close_component(raidPtr, vp, acd);
2877 raidPtr->raid_cinfo[c].ci_vp = NULL;
2878 raidPtr->Disks[c].auto_configured = 0;
2879 }
2880
2881 for (r = 0; r < raidPtr->numSpare; r++) {
2882 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2883 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2884 rf_close_component(raidPtr, vp, acd);
2885 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2886 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2887 }
2888 }
2889
2890
2891 void
2892 rf_ReconThread(struct rf_recon_req *req)
2893 {
2894 int s;
2895 RF_Raid_t *raidPtr;
2896
2897 s = splbio();
2898 raidPtr = (RF_Raid_t *) req->raidPtr;
2899 raidPtr->recon_in_progress = 1;
2900
2901 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2902 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2903
2904 RF_Free(req, sizeof(*req));
2905
2906 raidPtr->recon_in_progress = 0;
2907 splx(s);
2908
2909 /* That's all... */
2910 kthread_exit(0); /* does not return */
2911 }
2912
2913 void
2914 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2915 {
2916 int retcode;
2917 int s;
2918
2919 raidPtr->parity_rewrite_stripes_done = 0;
2920 raidPtr->parity_rewrite_in_progress = 1;
2921 s = splbio();
2922 retcode = rf_RewriteParity(raidPtr);
2923 splx(s);
2924 if (retcode) {
2925 printf("raid%d: Error re-writing parity (%d)!\n",
2926 raidPtr->raidid, retcode);
2927 } else {
2928 /* set the clean bit! If we shutdown correctly,
2929 the clean bit on each component label will get
2930 set */
2931 raidPtr->parity_good = RF_RAID_CLEAN;
2932 }
2933 raidPtr->parity_rewrite_in_progress = 0;
2934
2935 /* Anyone waiting for us to stop? If so, inform them... */
2936 if (raidPtr->waitShutdown) {
2937 wakeup(&raidPtr->parity_rewrite_in_progress);
2938 }
2939
2940 /* That's all... */
2941 kthread_exit(0); /* does not return */
2942 }
2943
2944
2945 void
2946 rf_CopybackThread(RF_Raid_t *raidPtr)
2947 {
2948 int s;
2949
2950 raidPtr->copyback_in_progress = 1;
2951 s = splbio();
2952 rf_CopybackReconstructedData(raidPtr);
2953 splx(s);
2954 raidPtr->copyback_in_progress = 0;
2955
2956 /* That's all... */
2957 kthread_exit(0); /* does not return */
2958 }
2959
2960
2961 void
2962 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2963 {
2964 int s;
2965 RF_Raid_t *raidPtr;
2966
2967 s = splbio();
2968 raidPtr = req->raidPtr;
2969 raidPtr->recon_in_progress = 1;
2970 rf_ReconstructInPlace(raidPtr, req->col);
2971 RF_Free(req, sizeof(*req));
2972 raidPtr->recon_in_progress = 0;
2973 splx(s);
2974
2975 /* That's all... */
2976 kthread_exit(0); /* does not return */
2977 }
2978
/*
 * Try to read a RAIDframe component label from (dev, vp).  If a
 * reasonable label is found, prepend a new RF_AutoConfig_t for this
 * component to ac_list and return the new head; otherwise the vnode is
 * closed and released and ac_list is returned unchanged.  On memory
 * exhaustion the whole list is torn down and NULL is returned.
 *
 * On success, ownership of vp transfers to the returned list entry.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* free the entire list accumulated so far, labels
		       included, before giving up */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				/* label was allocated here; free it before
				   tearing down the list */
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no usable label, so this component keeps
		   nothing -- release the label and the vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
3036
/*
 * Scan every disk-class device in the system for RAIDframe components.
 *
 * For wedges (dk devices), the wedge type must be DKW_PTYPE_RAIDFRAME;
 * for disklabel devices, every FS_RAID partition is probed; if neither
 * yields a component, the raw partition is probed as well.  Each
 * candidate is handed to rf_get_component(), which validates its
 * component label.  Returns the accumulated RF_AutoConfig_t list
 * (possibly NULL).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		/* need the sector count and size to validate labels later */
		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* wedges carry their own partition type; only
			   DKW_PTYPE_RAIDFRAME wedges are candidates */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* on success, rf_get_component() keeps the vnode */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
			rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
3218
3219
3220 int
3221 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3222 {
3223
3224 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3225 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3226 ((clabel->clean == RF_RAID_CLEAN) ||
3227 (clabel->clean == RF_RAID_DIRTY)) &&
3228 clabel->row >=0 &&
3229 clabel->column >= 0 &&
3230 clabel->num_rows > 0 &&
3231 clabel->num_columns > 0 &&
3232 clabel->row < clabel->num_rows &&
3233 clabel->column < clabel->num_columns &&
3234 clabel->blockSize > 0 &&
3235 /*
3236 * numBlocksHi may contain garbage, but it is ok since
3237 * the type is unsigned. If it is really garbage,
3238 * rf_fix_old_label_size() will fix it.
3239 */
3240 rf_component_label_numblocks(clabel) > 0) {
3241 /*
3242 * label looks reasonable enough...
3243 * let's make sure it has no old garbage.
3244 */
3245 if (numsecs)
3246 rf_fix_old_label_size(clabel, numsecs);
3247 return(1);
3248 }
3249 return(0);
3250 }
3251
3252
3253 /*
3254 * For reasons yet unknown, some old component labels have garbage in
3255 * the newer numBlocksHi region, and this causes lossage. Since those
3256 * disks will also have numsecs set to less than 32 bits of sectors,
3257 * we can determine when this corruption has occurred, and fix it.
3258 *
3259 * The exact same problem, with the same unknown reason, happens to
3260 * the partitionSizeHi member as well.
3261 */
3262 static void
3263 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3264 {
3265
3266 if (numsecs < ((uint64_t)1 << 32)) {
3267 if (clabel->numBlocksHi) {
3268 printf("WARNING: total sectors < 32 bits, yet "
3269 "numBlocksHi set\n"
3270 "WARNING: resetting numBlocksHi to zero.\n");
3271 clabel->numBlocksHi = 0;
3272 }
3273
3274 if (clabel->partitionSizeHi) {
3275 printf("WARNING: total sectors < 32 bits, yet "
3276 "partitionSizeHi set\n"
3277 "WARNING: resetting partitionSizeHi to zero.\n");
3278 clabel->partitionSizeHi = 0;
3279 }
3280 }
3281 }
3282
3283
#ifdef DEBUG
/*
 * Dump the interesting fields of a component label to the console.
 * Debug builds only.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;

	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3314
3315 RF_ConfigSet_t *
3316 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3317 {
3318 RF_AutoConfig_t *ac;
3319 RF_ConfigSet_t *config_sets;
3320 RF_ConfigSet_t *cset;
3321 RF_AutoConfig_t *ac_next;
3322
3323
3324 config_sets = NULL;
3325
3326 /* Go through the AutoConfig list, and figure out which components
3327 belong to what sets. */
3328 ac = ac_list;
3329 while(ac!=NULL) {
3330 /* we're going to putz with ac->next, so save it here
3331 for use at the end of the loop */
3332 ac_next = ac->next;
3333
3334 if (config_sets == NULL) {
3335 /* will need at least this one... */
3336 config_sets = (RF_ConfigSet_t *)
3337 malloc(sizeof(RF_ConfigSet_t),
3338 M_RAIDFRAME, M_NOWAIT);
3339 if (config_sets == NULL) {
3340 panic("rf_create_auto_sets: No memory!");
3341 }
3342 /* this one is easy :) */
3343 config_sets->ac = ac;
3344 config_sets->next = NULL;
3345 config_sets->rootable = 0;
3346 ac->next = NULL;
3347 } else {
3348 /* which set does this component fit into? */
3349 cset = config_sets;
3350 while(cset!=NULL) {
3351 if (rf_does_it_fit(cset, ac)) {
3352 /* looks like it matches... */
3353 ac->next = cset->ac;
3354 cset->ac = ac;
3355 break;
3356 }
3357 cset = cset->next;
3358 }
3359 if (cset==NULL) {
3360 /* didn't find a match above... new set..*/
3361 cset = (RF_ConfigSet_t *)
3362 malloc(sizeof(RF_ConfigSet_t),
3363 M_RAIDFRAME, M_NOWAIT);
3364 if (cset == NULL) {
3365 panic("rf_create_auto_sets: No memory!");
3366 }
3367 cset->ac = ac;
3368 ac->next = NULL;
3369 cset->next = config_sets;
3370 cset->rootable = 0;
3371 config_sets = cset;
3372 }
3373 }
3374 ac = ac_next;
3375 }
3376
3377
3378 return(config_sets);
3379 }
3380
3381 static int
3382 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3383 {
3384 RF_ComponentLabel_t *clabel1, *clabel2;
3385
3386 /* If this one matches the *first* one in the set, that's good
3387 enough, since the other members of the set would have been
3388 through here too... */
3389 /* note that we are not checking partitionSize here..
3390
3391 Note that we are also not checking the mod_counters here.
3392 If everything else matches except the mod_counter, that's
3393 good enough for this test. We will deal with the mod_counters
3394 a little later in the autoconfiguration process.
3395
3396 (clabel1->mod_counter == clabel2->mod_counter) &&
3397
3398 The reason we don't check for this is that failed disks
3399 will have lower modification counts. If those disks are
3400 not added to the set they used to belong to, then they will
3401 form their own set, which may result in 2 different sets,
3402 for example, competing to be configured at raid0, and
3403 perhaps competing to be the root filesystem set. If the
3404 wrong ones get configured, or both attempt to become /,
3405 weird behaviour and or serious lossage will occur. Thus we
3406 need to bring them into the fold here, and kick them out at
3407 a later point.
3408
3409 */
3410
3411 clabel1 = cset->ac->clabel;
3412 clabel2 = ac->clabel;
3413 if ((clabel1->version == clabel2->version) &&
3414 (clabel1->serial_number == clabel2->serial_number) &&
3415 (clabel1->num_rows == clabel2->num_rows) &&
3416 (clabel1->num_columns == clabel2->num_columns) &&
3417 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3418 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3419 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3420 (clabel1->parityConfig == clabel2->parityConfig) &&
3421 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3422 (clabel1->blockSize == clabel2->blockSize) &&
3423 rf_component_label_numblocks(clabel1) ==
3424 rf_component_label_numblocks(clabel2) &&
3425 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3426 (clabel1->root_partition == clabel2->root_partition) &&
3427 (clabel1->last_unit == clabel2->last_unit) &&
3428 (clabel1->config_order == clabel2->config_order)) {
3429 /* if it get's here, it almost *has* to be a match */
3430 } else {
3431 /* it's not consistent with somebody in the set..
3432 punt */
3433 return(0);
3434 }
3435 /* all was fine.. it must fit... */
3436 return(1);
3437 }
3438
/*
 * Check whether configuration set 'cset' has enough live components to
 * be configured.  Returns 1 if the set is viable, 0 otherwise.
 *
 * A component only counts if its mod_counter equals the highest
 * mod_counter seen in the set -- components with stale counters are
 * treated as missing (they were accepted into the set earlier by
 * rf_does_it_fit(), which deliberately ignores mod_counter).
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set. If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the maximum over all members. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* For each expected column, look for a member with that column
	   number and an up-to-date mod_counter. */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
			/* Didn't find one here! */
			/* special case for RAID 1, especially
			   where there are more than 2
			   components (where RAIDframe treats
			   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component. If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just finished the odd (second) member of a
			   RAID 1 pair and we didn't bail.. reset the
			   even_pair_failed flag, and go on to the
			   next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate
	   at most one.  RAID 1 was fully handled by the pair logic
	   above. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3541
3542 void
3543 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3544 RF_Raid_t *raidPtr)
3545 {
3546 RF_ComponentLabel_t *clabel;
3547 int i;
3548
3549 clabel = ac->clabel;
3550
3551 /* 1. Fill in the common stuff */
3552 config->numRow = clabel->num_rows = 1;
3553 config->numCol = clabel->num_columns;
3554 config->numSpare = 0; /* XXX should this be set here? */
3555 config->sectPerSU = clabel->sectPerSU;
3556 config->SUsPerPU = clabel->SUsPerPU;
3557 config->SUsPerRU = clabel->SUsPerRU;
3558 config->parityConfig = clabel->parityConfig;
3559 /* XXX... */
3560 strcpy(config->diskQueueType,"fifo");
3561 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3562 config->layoutSpecificSize = 0; /* XXX ?? */
3563
3564 while(ac!=NULL) {
3565 /* row/col values will be in range due to the checks
3566 in reasonable_label() */
3567 strcpy(config->devnames[0][ac->clabel->column],
3568 ac->devname);
3569 ac = ac->next;
3570 }
3571
3572 for(i=0;i<RF_MAXDBGV;i++) {
3573 config->debugVars[i][0] = 0;
3574 }
3575 }
3576
3577 int
3578 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3579 {
3580 RF_ComponentLabel_t *clabel;
3581 int column;
3582 int sparecol;
3583
3584 raidPtr->autoconfigure = new_value;
3585
3586 for(column=0; column<raidPtr->numCol; column++) {
3587 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3588 clabel = raidget_component_label(raidPtr, column);
3589 clabel->autoconfigure = new_value;
3590 raidflush_component_label(raidPtr, column);
3591 }
3592 }
3593 for(column = 0; column < raidPtr->numSpare ; column++) {
3594 sparecol = raidPtr->numCol + column;
3595 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3596 clabel = raidget_component_label(raidPtr, sparecol);
3597 clabel->autoconfigure = new_value;
3598 raidflush_component_label(raidPtr, sparecol);
3599 }
3600 }
3601 return(new_value);
3602 }
3603
3604 int
3605 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3606 {
3607 RF_ComponentLabel_t *clabel;
3608 int column;
3609 int sparecol;
3610
3611 raidPtr->root_partition = new_value;
3612 for(column=0; column<raidPtr->numCol; column++) {
3613 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3614 clabel = raidget_component_label(raidPtr, column);
3615 clabel->root_partition = new_value;
3616 raidflush_component_label(raidPtr, column);
3617 }
3618 }
3619 for(column = 0; column < raidPtr->numSpare ; column++) {
3620 sparecol = raidPtr->numCol + column;
3621 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3622 clabel = raidget_component_label(raidPtr, sparecol);
3623 clabel->root_partition = new_value;
3624 raidflush_component_label(raidPtr, sparecol);
3625 }
3626 }
3627 return(new_value);
3628 }
3629
3630 void
3631 rf_release_all_vps(RF_ConfigSet_t *cset)
3632 {
3633 RF_AutoConfig_t *ac;
3634
3635 ac = cset->ac;
3636 while(ac!=NULL) {
3637 /* Close the vp, and give it back */
3638 if (ac->vp) {
3639 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3640 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3641 vput(ac->vp);
3642 ac->vp = NULL;
3643 }
3644 ac = ac->next;
3645 }
3646 }
3647
3648
3649 void
3650 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3651 {
3652 RF_AutoConfig_t *ac;
3653 RF_AutoConfig_t *next_ac;
3654
3655 ac = cset->ac;
3656 while(ac!=NULL) {
3657 next_ac = ac->next;
3658 /* nuke the label */
3659 free(ac->clabel, M_RAIDFRAME);
3660 /* cleanup the config structure */
3661 free(ac, M_RAIDFRAME);
3662 /* "next.." */
3663 ac = next_ac;
3664 }
3665 /* and, finally, nuke the config set */
3666 free(cset, M_RAIDFRAME);
3667 }
3668
3669
/*
 * Initialize a component label from the current state of the raid set:
 * identity (serial number, mod counter), geometry (columns, stripe
 * layout, block size/count) and configuration flags.  The label is
 * filled in core only; writing it to disk is the caller's job.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* rows are always 1 in this implementation */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	/* splits the 64-bit sector count into the label's hi/lo halves */
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	/* remember which raid unit this was, for autoconfig next boot */
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3702
/*
 * Autoconfigure one configuration set: build an RF_Config_t from the
 * set's component labels, find a free raid unit (preferring the unit
 * the set was last configured on), and configure it.
 *
 * Returns the configured softc on success, or NULL on failure (the
 * provisionally-acquired softc is returned via raidput()).
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
		/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk upwards from last_unit until we hit an unconfigured unit.
	   NOTE(review): assumes raidget() always returns a softc for any
	   unit number (allocating on demand) -- confirm against raidget. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		/* configuration worked: finish device setup */
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		if (cset->ac->clabel->root_partition==1) {
			/* everything configured just fine.  Make a note
			   that this set is eligible to be root. */
			cset->rootable = 1;
			/* XXX do this here? */
			raidPtr->root_partition = 1;
		}
	} else {
		/* configuration failed: release the unit we grabbed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3767
3768 void
3769 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3770 {
3771 struct buf *bp;
3772 struct raid_softc *rs;
3773
3774 bp = (struct buf *)desc->bp;
3775 rs = desc->raidPtr->softc;
3776 disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
3777 (bp->b_flags & B_READ));
3778 }
3779
/*
 * Initialize a pool for RAIDframe use: create it at IPL_BIO, cap it at
 * xmax items, and pre-allocate xmin items so that at least that many
 * are available without sleeping (low-water mark keeps them topped up).
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3789
3790 /*
3791 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
3792 * if there is IO pending and if that IO could possibly be done for a
3793 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3794 * otherwise.
3795 *
3796 */
3797
3798 int
3799 rf_buf_queue_check(RF_Raid_t *raidPtr)
3800 {
3801 struct raid_softc *rs = raidPtr->softc;
3802 if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
3803 /* there is work to do */
3804 return 0;
3805 }
3806 /* default is nothing to do */
3807 return 1;
3808 }
3809
3810 int
3811 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3812 {
3813 uint64_t numsecs;
3814 unsigned secsize;
3815 int error;
3816
3817 error = getdisksize(vp, &numsecs, &secsize);
3818 if (error == 0) {
3819 diskPtr->blockSize = secsize;
3820 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3821 diskPtr->partitionSize = numsecs;
3822 return 0;
3823 }
3824 return error;
3825 }
3826
/* Autoconf match: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3832
/* Autoconf attach: nothing to do; real setup happens at configure time. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3838
3839
3840 static int
3841 raid_detach(device_t self, int flags)
3842 {
3843 int error;
3844 struct raid_softc *rs = device_private(self);
3845
3846 if ((error = raidlock(rs)) != 0)
3847 return (error);
3848
3849 error = raid_detach_unlocked(rs);
3850
3851 raidunlock(rs);
3852
3853 return error;
3854 }
3855
/*
 * Publish a synthetic disk geometry for the raid device via proplib so
 * that disklabel and friends see sensible values.  The "geometry" is
 * fabricated: sectors-per-track is the data stripe width and
 * tracks-per-cylinder is 4 * numCol (arbitrary scaling, not physical
 * geometry).  Replaces any previously attached dk_info dictionary.
 */
static void
rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	prop_dictionary_t disk_info, odisk_info, geom;
	disk_info = prop_dictionary_create();
	geom = prop_dictionary_create();
	prop_dictionary_set_uint64(geom, "sectors-per-unit",
				   raidPtr->totalSectors);
	prop_dictionary_set_uint32(geom, "sector-size",
				   raidPtr->bytesPerSector);

	prop_dictionary_set_uint16(geom, "sectors-per-track",
				   raidPtr->Layout.dataSectorsPerStripe);
	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
				   4 * raidPtr->numCol);

	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
	    raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
	    (4 * raidPtr->numCol)));

	/* geom is retained by disk_info; release our reference. */
	prop_dictionary_set(disk_info, "geometry", geom);
	prop_object_release(geom);
	prop_dictionary_set(device_properties(rs->sc_dev),
			    "disk-info", disk_info);
	/* swap in the new dictionary and drop the old one, if any */
	odisk_info = rs->sc_dkdev.dk_info;
	rs->sc_dkdev.dk_info = disk_info;
	if (odisk_info)
		prop_object_release(odisk_info);
}
3885
3886 /*
3887 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3888 * We end up returning whatever error was returned by the first cache flush
3889 * that fails.
3890 */
3891
3892 int
3893 rf_sync_component_caches(RF_Raid_t *raidPtr)
3894 {
3895 int c, sparecol;
3896 int e,error;
3897 int force = 1;
3898
3899 error = 0;
3900 for (c = 0; c < raidPtr->numCol; c++) {
3901 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3902 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3903 &force, FWRITE, NOCRED);
3904 if (e) {
3905 if (e != ENODEV)
3906 printf("raid%d: cache flush to component %s failed.\n",
3907 raidPtr->raidid, raidPtr->Disks[c].devname);
3908 if (error == 0) {
3909 error = e;
3910 }
3911 }
3912 }
3913 }
3914
3915 for( c = 0; c < raidPtr->numSpare ; c++) {
3916 sparecol = raidPtr->numCol + c;
3917 /* Need to ensure that the reconstruct actually completed! */
3918 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3919 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3920 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3921 if (e) {
3922 if (e != ENODEV)
3923 printf("raid%d: cache flush to component %s failed.\n",
3924 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3925 if (error == 0) {
3926 error = e;
3927 }
3928 }
3929 }
3930 }
3931 return error;
3932 }
3933