/* $NetBSD: rf_netbsdkintf.c,v 1.305 2014/03/16 05:20:29 dholland Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.305 2014/03/16 05:20:29 dholland Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129
130 #include <prop/proplib.h>
131
132 #include <dev/raidframe/raidframevar.h>
133 #include <dev/raidframe/raidframeio.h>
134 #include <dev/raidframe/rf_paritymap.h>
135
136 #include "rf_raid.h"
137 #include "rf_copyback.h"
138 #include "rf_dag.h"
139 #include "rf_dagflags.h"
140 #include "rf_desc.h"
141 #include "rf_diskqueue.h"
142 #include "rf_etimer.h"
143 #include "rf_general.h"
144 #include "rf_kintf.h"
145 #include "rf_options.h"
146 #include "rf_driver.h"
147 #include "rf_parityscan.h"
148 #include "rf_threadstuff.h"
149
150 #ifdef COMPAT_50
151 #include "rf_compat50.h"
152 #endif
153
154 #ifdef DEBUG
155 int rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else /* DEBUG */
158 #define db1_printf(a) { }
159 #endif /* DEBUG */
160
161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
162 static rf_declare_mutex2(rf_sparet_wait_mutex);
163 static rf_declare_cond2(rf_sparet_wait_cv);
164 static rf_declare_cond2(rf_sparet_resp_cv);
165
166 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
167 * spare table */
168 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
169 * installation process */
170 #endif
171
172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
173
174 /* prototypes */
175 static void KernelWakeupFunc(struct buf *);
176 static void InitBP(struct buf *, struct vnode *, unsigned,
177 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
178 void *, int, struct proc *);
179 struct raid_softc;
180 static void raidinit(struct raid_softc *);
181
182 void raidattach(int);
183 static int raid_match(device_t, cfdata_t, void *);
184 static void raid_attach(device_t, device_t, void *);
185 static int raid_detach(device_t, int);
186
187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
188 daddr_t, daddr_t);
189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
190 daddr_t, daddr_t, int);
191
192 static int raidwrite_component_label(unsigned,
193 dev_t, struct vnode *, RF_ComponentLabel_t *);
194 static int raidread_component_label(unsigned,
195 dev_t, struct vnode *, RF_ComponentLabel_t *);
196
197
198 dev_type_open(raidopen);
199 dev_type_close(raidclose);
200 dev_type_read(raidread);
201 dev_type_write(raidwrite);
202 dev_type_ioctl(raidioctl);
203 dev_type_strategy(raidstrategy);
204 dev_type_dump(raiddump);
205 dev_type_size(raidsize);
206
207 const struct bdevsw raid_bdevsw = {
208 .d_open = raidopen,
209 .d_close = raidclose,
210 .d_strategy = raidstrategy,
211 .d_ioctl = raidioctl,
212 .d_dump = raiddump,
213 .d_psize = raidsize,
214 .d_flag = D_DISK
215 };
216
217 const struct cdevsw raid_cdevsw = {
218 .d_open = raidopen,
219 .d_close = raidclose,
220 .d_read = raidread,
221 .d_write = raidwrite,
222 .d_ioctl = raidioctl,
223 .d_stop = nostop,
224 .d_tty = notty,
225 .d_poll = nopoll,
226 .d_mmap = nommap,
227 .d_kqfilter = nokqfilter,
228 .d_flag = D_DISK
229 };
230
231 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
232
233 struct raid_softc {
234 device_t sc_dev;
235 int sc_unit;
236 int sc_flags; /* flags */
237 int sc_cflags; /* configuration flags */
238 uint64_t sc_size; /* size of the raid device */
239 char sc_xname[20]; /* XXX external name */
240 struct disk sc_dkdev; /* generic disk device info */
241 struct bufq_state *buf_queue; /* used for the device queue */
242 RF_Raid_t sc_r;
243 LIST_ENTRY(raid_softc) sc_link;
244 };
245 /* sc_flags */
246 #define RAIDF_INITED 0x01 /* unit has been initialized */
247 #define RAIDF_WLABEL 0x02 /* label area is writable */
248 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
249 #define RAIDF_SHUTDOWN 0x08 /* unit is being shutdown */
250 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
251 #define RAIDF_LOCKED 0x80 /* unit is locked */
252
253 #define raidunit(x) DISKUNIT(x)
254
255 extern struct cfdriver raid_cd;
256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
257 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
258 DVF_DETACH_SHUTDOWN);
259
260 /*
261 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
262 * Be aware that large numbers can allow the driver to consume a lot of
263 * kernel memory, especially on writes, and in degraded mode reads.
264 *
265 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
266 * a single 64K write will typically require 64K for the old data,
267 * 64K for the old parity, and 64K for the new parity, for a total
268 * of 192K (if the parity buffer is not re-used immediately).
269 * Even it if is used immediately, that's still 128K, which when multiplied
270 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
271 *
272 * Now in degraded mode, for example, a 64K read on the above setup may
273 * require data reconstruction, which will require *all* of the 4 remaining
274 * disks to participate -- 4 * 32K/disk == 128K again.
275 */
276
277 #ifndef RAIDOUTSTANDING
278 #define RAIDOUTSTANDING 6
279 #endif
280
281 #define RAIDLABELDEV(dev) \
282 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
283
284 /* declared here, and made public, for the benefit of KVM stuff.. */
285
286 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
287 struct disklabel *);
288 static void raidgetdisklabel(dev_t);
289 static void raidmakedisklabel(struct raid_softc *);
290
291 static int raidlock(struct raid_softc *);
292 static void raidunlock(struct raid_softc *);
293
294 static int raid_detach_unlocked(struct raid_softc *);
295
296 static void rf_markalldirty(RF_Raid_t *);
297 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
298
299 void rf_ReconThread(struct rf_recon_req *);
300 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
301 void rf_CopybackThread(RF_Raid_t *raidPtr);
302 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
303 int rf_autoconfig(device_t);
304 void rf_buildroothack(RF_ConfigSet_t *);
305
306 RF_AutoConfig_t *rf_find_raid_components(void);
307 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
308 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
309 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
310 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
311 int rf_set_autoconfig(RF_Raid_t *, int);
312 int rf_set_rootpartition(RF_Raid_t *, int);
313 void rf_release_all_vps(RF_ConfigSet_t *);
314 void rf_cleanup_config_set(RF_ConfigSet_t *);
315 int rf_have_enough_components(RF_ConfigSet_t *);
316 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
317 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
318
319 /*
320 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
321 * Note that this is overridden by having RAID_AUTOCONFIG as an option
322 * in the kernel config file.
323 */
324 #ifdef RAID_AUTOCONFIG
325 int raidautoconfig = 1;
326 #else
327 int raidautoconfig = 0;
328 #endif
329 static bool raidautoconfigdone = false;
330
331 struct RF_Pools_s rf_pools;
332
333 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
334 static kmutex_t raid_lock;
335
336 static struct raid_softc *
337 raidcreate(int unit) {
338 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
339 if (sc == NULL) {
340 #ifdef DIAGNOSTIC
341 printf("%s: out of memory\n", __func__);
342 #endif
343 return NULL;
344 }
345 sc->sc_unit = unit;
346 bufq_alloc(&sc->buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
347 return sc;
348 }
349
/*
 * Release a softc allocated by raidcreate(): free its buffer queue
 * and then the structure itself.  The caller must already have
 * removed it from any list it was on.
 */
static void
raiddestroy(struct raid_softc *sc) {
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
355
356 static struct raid_softc *
357 raidget(int unit) {
358 struct raid_softc *sc;
359 if (unit < 0) {
360 #ifdef DIAGNOSTIC
361 panic("%s: unit %d!", __func__, unit);
362 #endif
363 return NULL;
364 }
365 mutex_enter(&raid_lock);
366 LIST_FOREACH(sc, &raids, sc_link) {
367 if (sc->sc_unit == unit) {
368 mutex_exit(&raid_lock);
369 return sc;
370 }
371 }
372 mutex_exit(&raid_lock);
373 if ((sc = raidcreate(unit)) == NULL)
374 return NULL;
375 mutex_enter(&raid_lock);
376 LIST_INSERT_HEAD(&raids, sc, sc_link);
377 mutex_exit(&raid_lock);
378 return sc;
379 }
380
/*
 * Unlink sc from the global `raids' list (under raid_lock) and
 * then destroy it.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
388
/*
 * Pseudo-device attach routine, called once at boot.
 *
 * Initialises the global unit-list lock (and, when parity
 * declustering with distributed spares is configured, the spare
 * table wait/response queues and their synchronisation primitives),
 * boots the RAIDframe core, attaches the cfattach so units can be
 * configured, and registers a config finalizer that will run RAID
 * autoconfiguration once all real hardware has been found.
 *
 * `num' (the requested number of units) is unused here; units are
 * created on demand by raidget().
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
421
422 int
423 rf_autoconfig(device_t self)
424 {
425 RF_AutoConfig_t *ac_list;
426 RF_ConfigSet_t *config_sets;
427
428 if (!raidautoconfig || raidautoconfigdone == true)
429 return (0);
430
431 /* XXX This code can only be run once. */
432 raidautoconfigdone = true;
433
434 /* 1. locate all RAID components on the system */
435 aprint_debug("Searching for RAID components...\n");
436 ac_list = rf_find_raid_components();
437
438 /* 2. Sort them into their respective sets. */
439 config_sets = rf_create_auto_sets(ac_list);
440
441 /*
442 * 3. Evaluate each set and configure the valid ones.
443 * This gets done in rf_buildroothack().
444 */
445 rf_buildroothack(config_sets);
446
447 return 1;
448 }
449
/*
 * Evaluate each autoconfig set: configure those that have enough
 * components and have autoconfigure enabled, then try to decide
 * whether one of the configured sets should become the root device.
 *
 * If the user hardwired a root spec, booted_device/boothowto are
 * left alone.  If exactly one rootable set was configured it becomes
 * booted_device (preferring a wedge named "<dev>a" when wedges
 * exist).  If several are rootable, ask the MD code which device we
 * booted from and pick the set containing that component; if still
 * ambiguous, set RB_ASKNAME so the user decides.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int col;
	int num_root;
	char *devname;
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/* Walk every set; each iteration consumes (cleans up) the set. */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			/* XXX: How do we find the real root partition? */
			char cname[sizeof(cset->ac->devname)];
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			booted_device = dkwedge_find_by_wname(cname);
		} else
			booted_device = rsc->sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/* Re-count: only sets that contain the boot device qualify. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			for (col = 0; col < r->numCol; col++) {
				devname = r->Disks[col].devname;
				/* strip the leading "/dev/" */
				devname += sizeof("/dev/") - 1;
				/*
				 * NOTE(review): this is a prefix match --
				 * comparing only strlen(booted name) bytes
				 * would let "wd1" also match "wd10".
				 * Confirm this is intended.
				 */
				if (strncmp(devname, device_xname(booted_device),
					    strlen(device_xname(booted_device))) != 0)
					continue;
				aprint_debug("raid%d includes boot device %s\n",
				    sc->sc_unit, devname);
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
553
554
555 int
556 raidsize(dev_t dev)
557 {
558 struct raid_softc *rs;
559 struct disklabel *lp;
560 int part, unit, omask, size;
561
562 unit = raidunit(dev);
563 if ((rs = raidget(unit)) == NULL)
564 return -1;
565 if ((rs->sc_flags & RAIDF_INITED) == 0)
566 return (-1);
567
568 part = DISKPART(dev);
569 omask = rs->sc_dkdev.dk_openmask & (1 << part);
570 lp = rs->sc_dkdev.dk_label;
571
572 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
573 return (-1);
574
575 if (lp->d_partitions[part].p_fstype != FS_SWAP)
576 size = -1;
577 else
578 size = lp->d_partitions[part].p_size *
579 (lp->d_secsize / DEV_BSIZE);
580
581 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
582 return (-1);
583
584 return (size);
585
586 }
587
/*
 * Crash-dump entry point.  Only RAID 1 sets (1 data + 1 parity
 * column) are supported: the dump is written directly to a single
 * live underlying component, bypassing the RAIDframe engine.
 *
 * dev   - raid device partition being dumped to
 * blkno - block offset within that partition (DEV_BSIZE units)
 * va    - kernel virtual address of the dump data
 * size  - byte count; must be a multiple of DEV_BSIZE
 *
 * Returns 0 on success or an errno.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int part, c, sparecol, j, scol, dumpto;
	int error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	raidPtr = &rs->sc_r;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* Dumps must be whole DEV_BSIZE blocks. */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* The dump must fit within the raid device. */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	/* Pass 1: any optimal (live) column wins outright. */
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	/* Pass 2: consider in-use spares, mapping each back to the
	   column it replaces. */
	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
724 /* ARGSUSED */
/*
 * Open a partition of the raid device.
 *
 * Fails with EBUSY if the unit is being shut down, or if wedges
 * exist and a non-raw partition is requested; fails with ENXIO if a
 * non-raw partition does not exist on an initialised set.  On the
 * very first open of a configured set, all components are marked
 * dirty so that an unclean shutdown can be detected later.
 *
 * Note: the success path falls through to the `bad' label too --
 * `error' is 0 there, so it just unlocks and returns 0.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, pmask;
	int error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of an initialised set: (re-)read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	/* Recompute the combined (char | block) open mask. */
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
808 /* ARGSUSED */
/*
 * Close a partition of the raid device.  Clears the partition's bit
 * from the per-mode open mask; on the last close of an initialised
 * set, the component labels are updated (marked clean).  Returns 0
 * unless raidlock() itself fails.
 */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	int error = 0;
	int part;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(&rs->sc_r,
		    RF_FINAL_COMPONENT_UPDATE);

		/* If the kernel is shutting down, it will detach
		 * this RAID set soon enough.
		 */
	}

	raidunlock(rs);
	return (0);

}
857
/*
 * Block-device strategy routine: validate the request, bounds-check
 * it against either the raw device size or the disklabel, and queue
 * the buffer for asynchronous servicing.
 *
 * Buffers are not serviced here: they are placed on the softc's
 * bufq under iodone_lock and iodone_cv is signalled to wake whatever
 * thread drains the queue (that consumer is not visible in this
 * file chunk).  All error paths complete the buffer via biodone()
 * with b_resid == b_bcount.
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int wlabel;
	struct raid_softc *rs;

	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	/* Zero-length I/O succeeds trivially (b_error stays 0). */
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size;	/* device size in DEV_BSIZE unit */

		/* Convert totalSectors (native sector size) into
		   DEV_BSIZE units, shifting in whichever direction
		   the sector size requires. */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* scheduled the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
928 /* ARGSUSED */
929 int
930 raidread(dev_t dev, struct uio *uio, int flags)
931 {
932 int unit = raidunit(dev);
933 struct raid_softc *rs;
934
935 if ((rs = raidget(unit)) == NULL)
936 return ENXIO;
937
938 if ((rs->sc_flags & RAIDF_INITED) == 0)
939 return (ENXIO);
940
941 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
942
943 }
944 /* ARGSUSED */
945 int
946 raidwrite(dev_t dev, struct uio *uio, int flags)
947 {
948 int unit = raidunit(dev);
949 struct raid_softc *rs;
950
951 if ((rs = raidget(unit)) == NULL)
952 return ENXIO;
953
954 if ((rs->sc_flags & RAIDF_INITED) == 0)
955 return (ENXIO);
956
957 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
958
959 }
960
/*
 * Tear down a RAID unit.  "Unlocked" means this routine takes no
 * locks itself -- the caller is expected to hold the unit lock.
 *
 * Returns EBUSY if any partition is still open, or the error from
 * rf_Shutdown().  On success the disk structure is detached and
 * destroyed and RAIDF_INITED/RAIDF_SHUTDOWN are cleared.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	int error;
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;

	/*
	 * If somebody has a partition mounted, we shouldn't
	 * shutdown.
	 */
	if (rs->sc_dkdev.dk_openmask != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		;	/* not initialized: nothing to do */
	else if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;
	else
		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);

	/* Detach the disk. */
	dkwedge_delall(&rs->sc_dkdev);
	disk_detach(&rs->sc_dkdev);
	disk_destroy(&rs->sc_dkdev);

	aprint_normal_dev(rs->sc_dev, "detached\n");

	return 0;
}
992
993 int
994 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
995 {
996 int unit = raidunit(dev);
997 int error = 0;
998 int part, pmask, s;
999 cfdata_t cf;
1000 struct raid_softc *rs;
1001 RF_Config_t *k_cfg, *u_cfg;
1002 RF_Raid_t *raidPtr;
1003 RF_RaidDisk_t *diskPtr;
1004 RF_AccTotals_t *totals;
1005 RF_DeviceConfig_t *d_cfg, **ucfgp;
1006 u_char *specific_buf;
1007 int retcode = 0;
1008 int column;
1009 /* int raidid; */
1010 struct rf_recon_req *rrcopy, *rr;
1011 RF_ComponentLabel_t *clabel;
1012 RF_ComponentLabel_t *ci_label;
1013 RF_ComponentLabel_t **clabel_ptr;
1014 RF_SingleComponent_t *sparePtr,*componentPtr;
1015 RF_SingleComponent_t component;
1016 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1017 int i, j, d;
1018 #ifdef __HAVE_OLD_DISKLABEL
1019 struct disklabel newlabel;
1020 #endif
1021 struct dkwedge_info *dkw;
1022
1023 if ((rs = raidget(unit)) == NULL)
1024 return ENXIO;
1025 raidPtr = &rs->sc_r;
1026
1027 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1028 (int) DISKPART(dev), (int) unit, cmd));
1029
1030 /* Must be open for writes for these commands... */
1031 switch (cmd) {
1032 #ifdef DIOCGSECTORSIZE
1033 case DIOCGSECTORSIZE:
1034 *(u_int *)data = raidPtr->bytesPerSector;
1035 return 0;
1036 case DIOCGMEDIASIZE:
1037 *(off_t *)data =
1038 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1039 return 0;
1040 #endif
1041 case DIOCSDINFO:
1042 case DIOCWDINFO:
1043 #ifdef __HAVE_OLD_DISKLABEL
1044 case ODIOCWDINFO:
1045 case ODIOCSDINFO:
1046 #endif
1047 case DIOCWLABEL:
1048 case DIOCAWEDGE:
1049 case DIOCDWEDGE:
1050 case DIOCSSTRATEGY:
1051 if ((flag & FWRITE) == 0)
1052 return (EBADF);
1053 }
1054
1055 /* Must be initialized for these... */
1056 switch (cmd) {
1057 case DIOCGDINFO:
1058 case DIOCSDINFO:
1059 case DIOCWDINFO:
1060 #ifdef __HAVE_OLD_DISKLABEL
1061 case ODIOCGDINFO:
1062 case ODIOCWDINFO:
1063 case ODIOCSDINFO:
1064 case ODIOCGDEFLABEL:
1065 #endif
1066 case DIOCGPART:
1067 case DIOCWLABEL:
1068 case DIOCGDEFLABEL:
1069 case DIOCAWEDGE:
1070 case DIOCDWEDGE:
1071 case DIOCLWEDGES:
1072 case DIOCCACHESYNC:
1073 case RAIDFRAME_SHUTDOWN:
1074 case RAIDFRAME_REWRITEPARITY:
1075 case RAIDFRAME_GET_INFO:
1076 case RAIDFRAME_RESET_ACCTOTALS:
1077 case RAIDFRAME_GET_ACCTOTALS:
1078 case RAIDFRAME_KEEP_ACCTOTALS:
1079 case RAIDFRAME_GET_SIZE:
1080 case RAIDFRAME_FAIL_DISK:
1081 case RAIDFRAME_COPYBACK:
1082 case RAIDFRAME_CHECK_RECON_STATUS:
1083 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1084 case RAIDFRAME_GET_COMPONENT_LABEL:
1085 case RAIDFRAME_SET_COMPONENT_LABEL:
1086 case RAIDFRAME_ADD_HOT_SPARE:
1087 case RAIDFRAME_REMOVE_HOT_SPARE:
1088 case RAIDFRAME_INIT_LABELS:
1089 case RAIDFRAME_REBUILD_IN_PLACE:
1090 case RAIDFRAME_CHECK_PARITY:
1091 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1092 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1093 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1094 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1095 case RAIDFRAME_SET_AUTOCONFIG:
1096 case RAIDFRAME_SET_ROOT:
1097 case RAIDFRAME_DELETE_COMPONENT:
1098 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1099 case RAIDFRAME_PARITYMAP_STATUS:
1100 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1101 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1102 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1103 case DIOCGSTRATEGY:
1104 case DIOCSSTRATEGY:
1105 if ((rs->sc_flags & RAIDF_INITED) == 0)
1106 return (ENXIO);
1107 }
1108
1109 switch (cmd) {
1110 #ifdef COMPAT_50
1111 case RAIDFRAME_GET_INFO50:
1112 return rf_get_info50(raidPtr, data);
1113
1114 case RAIDFRAME_CONFIGURE50:
1115 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1116 return retcode;
1117 goto config;
1118 #endif
1119 /* configure the system */
1120 case RAIDFRAME_CONFIGURE:
1121
1122 if (raidPtr->valid) {
1123 /* There is a valid RAID set running on this unit! */
1124 printf("raid%d: Device already configured!\n",unit);
1125 return(EINVAL);
1126 }
1127
1128 /* copy-in the configuration information */
1129 /* data points to a pointer to the configuration structure */
1130
1131 u_cfg = *((RF_Config_t **) data);
1132 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1133 if (k_cfg == NULL) {
1134 return (ENOMEM);
1135 }
1136 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1137 if (retcode) {
1138 RF_Free(k_cfg, sizeof(RF_Config_t));
1139 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1140 retcode));
1141 return (retcode);
1142 }
1143 goto config;
1144 config:
1145 /* allocate a buffer for the layout-specific data, and copy it
1146 * in */
1147 if (k_cfg->layoutSpecificSize) {
1148 if (k_cfg->layoutSpecificSize > 10000) {
1149 /* sanity check */
1150 RF_Free(k_cfg, sizeof(RF_Config_t));
1151 return (EINVAL);
1152 }
1153 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1154 (u_char *));
1155 if (specific_buf == NULL) {
1156 RF_Free(k_cfg, sizeof(RF_Config_t));
1157 return (ENOMEM);
1158 }
1159 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1160 k_cfg->layoutSpecificSize);
1161 if (retcode) {
1162 RF_Free(k_cfg, sizeof(RF_Config_t));
1163 RF_Free(specific_buf,
1164 k_cfg->layoutSpecificSize);
1165 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1166 retcode));
1167 return (retcode);
1168 }
1169 } else
1170 specific_buf = NULL;
1171 k_cfg->layoutSpecific = specific_buf;
1172
1173 /* should do some kind of sanity check on the configuration.
1174 * Store the sum of all the bytes in the last byte? */
1175
1176 /* configure the system */
1177
1178 /*
1179 * Clear the entire RAID descriptor, just to make sure
1180 * there is no stale data left in the case of a
1181 * reconfiguration
1182 */
1183 memset(raidPtr, 0, sizeof(*raidPtr));
1184 raidPtr->softc = rs;
1185 raidPtr->raidid = unit;
1186
1187 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1188
1189 if (retcode == 0) {
1190
1191 /* allow this many simultaneous IO's to
1192 this RAID device */
1193 raidPtr->openings = RAIDOUTSTANDING;
1194
1195 raidinit(rs);
1196 rf_markalldirty(raidPtr);
1197 }
1198 /* free the buffers. No return code here. */
1199 if (k_cfg->layoutSpecificSize) {
1200 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1201 }
1202 RF_Free(k_cfg, sizeof(RF_Config_t));
1203
1204 return (retcode);
1205
1206 /* shutdown the system */
1207 case RAIDFRAME_SHUTDOWN:
1208
1209 part = DISKPART(dev);
1210 pmask = (1 << part);
1211
1212 if ((error = raidlock(rs)) != 0)
1213 return (error);
1214
1215 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1216 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1217 (rs->sc_dkdev.dk_copenmask & pmask)))
1218 retcode = EBUSY;
1219 else {
1220 rs->sc_flags |= RAIDF_SHUTDOWN;
1221 rs->sc_dkdev.dk_copenmask &= ~pmask;
1222 rs->sc_dkdev.dk_bopenmask &= ~pmask;
1223 rs->sc_dkdev.dk_openmask &= ~pmask;
1224 retcode = 0;
1225 }
1226
1227 raidunlock(rs);
1228
1229 if (retcode != 0)
1230 return retcode;
1231
1232 /* free the pseudo device attach bits */
1233
1234 cf = device_cfdata(rs->sc_dev);
1235 if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
1236 free(cf, M_RAIDFRAME);
1237
1238 return (retcode);
1239 case RAIDFRAME_GET_COMPONENT_LABEL:
1240 clabel_ptr = (RF_ComponentLabel_t **) data;
1241 /* need to read the component label for the disk indicated
1242 by row,column in clabel */
1243
1244 /*
1245 * Perhaps there should be an option to skip the in-core
1246 * copy and hit the disk, as with disklabel(8).
1247 */
1248 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1249
1250 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1251
1252 if (retcode) {
1253 RF_Free(clabel, sizeof(*clabel));
1254 return retcode;
1255 }
1256
1257 clabel->row = 0; /* Don't allow looking at anything else.*/
1258
1259 column = clabel->column;
1260
1261 if ((column < 0) || (column >= raidPtr->numCol +
1262 raidPtr->numSpare)) {
1263 RF_Free(clabel, sizeof(*clabel));
1264 return EINVAL;
1265 }
1266
1267 RF_Free(clabel, sizeof(*clabel));
1268
1269 clabel = raidget_component_label(raidPtr, column);
1270
1271 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1272
1273 #if 0
1274 case RAIDFRAME_SET_COMPONENT_LABEL:
1275 clabel = (RF_ComponentLabel_t *) data;
1276
1277 /* XXX check the label for valid stuff... */
1278 /* Note that some things *should not* get modified --
1279 the user should be re-initing the labels instead of
1280 trying to patch things.
1281 */
1282
1283 raidid = raidPtr->raidid;
1284 #ifdef DEBUG
1285 printf("raid%d: Got component label:\n", raidid);
1286 printf("raid%d: Version: %d\n", raidid, clabel->version);
1287 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1288 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1289 printf("raid%d: Column: %d\n", raidid, clabel->column);
1290 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1291 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1292 printf("raid%d: Status: %d\n", raidid, clabel->status);
1293 #endif
1294 clabel->row = 0;
1295 column = clabel->column;
1296
1297 if ((column < 0) || (column >= raidPtr->numCol)) {
1298 return(EINVAL);
1299 }
1300
1301 /* XXX this isn't allowed to do anything for now :-) */
1302
1303 /* XXX and before it is, we need to fill in the rest
1304 of the fields!?!?!?! */
1305 memcpy(raidget_component_label(raidPtr, column),
1306 clabel, sizeof(*clabel));
1307 raidflush_component_label(raidPtr, column);
1308 return (0);
1309 #endif
1310
1311 case RAIDFRAME_INIT_LABELS:
1312 clabel = (RF_ComponentLabel_t *) data;
1313 /*
1314 we only want the serial number from
1315 the above. We get all the rest of the information
1316 from the config that was used to create this RAID
1317 set.
1318 */
1319
1320 raidPtr->serial_number = clabel->serial_number;
1321
1322 for(column=0;column<raidPtr->numCol;column++) {
1323 diskPtr = &raidPtr->Disks[column];
1324 if (!RF_DEAD_DISK(diskPtr->status)) {
1325 ci_label = raidget_component_label(raidPtr,
1326 column);
1327 /* Zeroing this is important. */
1328 memset(ci_label, 0, sizeof(*ci_label));
1329 raid_init_component_label(raidPtr, ci_label);
1330 ci_label->serial_number =
1331 raidPtr->serial_number;
1332 ci_label->row = 0; /* we dont' pretend to support more */
1333 rf_component_label_set_partitionsize(ci_label,
1334 diskPtr->partitionSize);
1335 ci_label->column = column;
1336 raidflush_component_label(raidPtr, column);
1337 }
1338 /* XXXjld what about the spares? */
1339 }
1340
1341 return (retcode);
1342 case RAIDFRAME_SET_AUTOCONFIG:
1343 d = rf_set_autoconfig(raidPtr, *(int *) data);
1344 printf("raid%d: New autoconfig value is: %d\n",
1345 raidPtr->raidid, d);
1346 *(int *) data = d;
1347 return (retcode);
1348
1349 case RAIDFRAME_SET_ROOT:
1350 d = rf_set_rootpartition(raidPtr, *(int *) data);
1351 printf("raid%d: New rootpartition value is: %d\n",
1352 raidPtr->raidid, d);
1353 *(int *) data = d;
1354 return (retcode);
1355
1356 /* initialize all parity */
1357 case RAIDFRAME_REWRITEPARITY:
1358
1359 if (raidPtr->Layout.map->faultsTolerated == 0) {
1360 /* Parity for RAID 0 is trivially correct */
1361 raidPtr->parity_good = RF_RAID_CLEAN;
1362 return(0);
1363 }
1364
1365 if (raidPtr->parity_rewrite_in_progress == 1) {
1366 /* Re-write is already in progress! */
1367 return(EINVAL);
1368 }
1369
1370 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1371 rf_RewriteParityThread,
1372 raidPtr,"raid_parity");
1373 return (retcode);
1374
1375
1376 case RAIDFRAME_ADD_HOT_SPARE:
1377 sparePtr = (RF_SingleComponent_t *) data;
1378 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1379 retcode = rf_add_hot_spare(raidPtr, &component);
1380 return(retcode);
1381
1382 case RAIDFRAME_REMOVE_HOT_SPARE:
1383 return(retcode);
1384
1385 case RAIDFRAME_DELETE_COMPONENT:
1386 componentPtr = (RF_SingleComponent_t *)data;
1387 memcpy( &component, componentPtr,
1388 sizeof(RF_SingleComponent_t));
1389 retcode = rf_delete_component(raidPtr, &component);
1390 return(retcode);
1391
1392 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1393 componentPtr = (RF_SingleComponent_t *)data;
1394 memcpy( &component, componentPtr,
1395 sizeof(RF_SingleComponent_t));
1396 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1397 return(retcode);
1398
1399 case RAIDFRAME_REBUILD_IN_PLACE:
1400
1401 if (raidPtr->Layout.map->faultsTolerated == 0) {
1402 /* Can't do this on a RAID 0!! */
1403 return(EINVAL);
1404 }
1405
1406 if (raidPtr->recon_in_progress == 1) {
1407 /* a reconstruct is already in progress! */
1408 return(EINVAL);
1409 }
1410
1411 componentPtr = (RF_SingleComponent_t *) data;
1412 memcpy( &component, componentPtr,
1413 sizeof(RF_SingleComponent_t));
1414 component.row = 0; /* we don't support any more */
1415 column = component.column;
1416
1417 if ((column < 0) || (column >= raidPtr->numCol)) {
1418 return(EINVAL);
1419 }
1420
1421 rf_lock_mutex2(raidPtr->mutex);
1422 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1423 (raidPtr->numFailures > 0)) {
1424 /* XXX 0 above shouldn't be constant!!! */
1425 /* some component other than this has failed.
1426 Let's not make things worse than they already
1427 are... */
1428 printf("raid%d: Unable to reconstruct to disk at:\n",
1429 raidPtr->raidid);
1430 printf("raid%d: Col: %d Too many failures.\n",
1431 raidPtr->raidid, column);
1432 rf_unlock_mutex2(raidPtr->mutex);
1433 return (EINVAL);
1434 }
1435 if (raidPtr->Disks[column].status ==
1436 rf_ds_reconstructing) {
1437 printf("raid%d: Unable to reconstruct to disk at:\n",
1438 raidPtr->raidid);
1439 printf("raid%d: Col: %d Reconstruction already occurring!\n", raidPtr->raidid, column);
1440
1441 rf_unlock_mutex2(raidPtr->mutex);
1442 return (EINVAL);
1443 }
1444 if (raidPtr->Disks[column].status == rf_ds_spared) {
1445 rf_unlock_mutex2(raidPtr->mutex);
1446 return (EINVAL);
1447 }
1448 rf_unlock_mutex2(raidPtr->mutex);
1449
1450 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1451 if (rrcopy == NULL)
1452 return(ENOMEM);
1453
1454 rrcopy->raidPtr = (void *) raidPtr;
1455 rrcopy->col = column;
1456
1457 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1458 rf_ReconstructInPlaceThread,
1459 rrcopy,"raid_reconip");
1460 return(retcode);
1461
1462 case RAIDFRAME_GET_INFO:
1463 if (!raidPtr->valid)
1464 return (ENODEV);
1465 ucfgp = (RF_DeviceConfig_t **) data;
1466 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1467 (RF_DeviceConfig_t *));
1468 if (d_cfg == NULL)
1469 return (ENOMEM);
1470 d_cfg->rows = 1; /* there is only 1 row now */
1471 d_cfg->cols = raidPtr->numCol;
1472 d_cfg->ndevs = raidPtr->numCol;
1473 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1474 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1475 return (ENOMEM);
1476 }
1477 d_cfg->nspares = raidPtr->numSpare;
1478 if (d_cfg->nspares >= RF_MAX_DISKS) {
1479 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1480 return (ENOMEM);
1481 }
1482 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1483 d = 0;
1484 for (j = 0; j < d_cfg->cols; j++) {
1485 d_cfg->devs[d] = raidPtr->Disks[j];
1486 d++;
1487 }
1488 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1489 d_cfg->spares[i] = raidPtr->Disks[j];
1490 }
1491 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1492 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1493
1494 return (retcode);
1495
1496 case RAIDFRAME_CHECK_PARITY:
1497 *(int *) data = raidPtr->parity_good;
1498 return (0);
1499
1500 case RAIDFRAME_PARITYMAP_STATUS:
1501 if (rf_paritymap_ineligible(raidPtr))
1502 return EINVAL;
1503 rf_paritymap_status(raidPtr->parity_map,
1504 (struct rf_pmstat *)data);
1505 return 0;
1506
1507 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1508 if (rf_paritymap_ineligible(raidPtr))
1509 return EINVAL;
1510 if (raidPtr->parity_map == NULL)
1511 return ENOENT; /* ??? */
1512 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1513 (struct rf_pmparams *)data, 1))
1514 return EINVAL;
1515 return 0;
1516
1517 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1518 if (rf_paritymap_ineligible(raidPtr))
1519 return EINVAL;
1520 *(int *) data = rf_paritymap_get_disable(raidPtr);
1521 return 0;
1522
1523 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1524 if (rf_paritymap_ineligible(raidPtr))
1525 return EINVAL;
1526 rf_paritymap_set_disable(raidPtr, *(int *)data);
1527 /* XXX should errors be passed up? */
1528 return 0;
1529
1530 case RAIDFRAME_RESET_ACCTOTALS:
1531 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1532 return (0);
1533
1534 case RAIDFRAME_GET_ACCTOTALS:
1535 totals = (RF_AccTotals_t *) data;
1536 *totals = raidPtr->acc_totals;
1537 return (0);
1538
1539 case RAIDFRAME_KEEP_ACCTOTALS:
1540 raidPtr->keep_acc_totals = *(int *)data;
1541 return (0);
1542
1543 case RAIDFRAME_GET_SIZE:
1544 *(int *) data = raidPtr->totalSectors;
1545 return (0);
1546
1547 /* fail a disk & optionally start reconstruction */
1548 case RAIDFRAME_FAIL_DISK:
1549
1550 if (raidPtr->Layout.map->faultsTolerated == 0) {
1551 /* Can't do this on a RAID 0!! */
1552 return(EINVAL);
1553 }
1554
1555 rr = (struct rf_recon_req *) data;
1556 rr->row = 0;
1557 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1558 return (EINVAL);
1559
1560
1561 rf_lock_mutex2(raidPtr->mutex);
1562 if (raidPtr->status == rf_rs_reconstructing) {
1563 /* you can't fail a disk while we're reconstructing! */
1564 /* XXX wrong for RAID6 */
1565 rf_unlock_mutex2(raidPtr->mutex);
1566 return (EINVAL);
1567 }
1568 if ((raidPtr->Disks[rr->col].status ==
1569 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1570 /* some other component has failed. Let's not make
1571 things worse. XXX wrong for RAID6 */
1572 rf_unlock_mutex2(raidPtr->mutex);
1573 return (EINVAL);
1574 }
1575 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1576 /* Can't fail a spared disk! */
1577 rf_unlock_mutex2(raidPtr->mutex);
1578 return (EINVAL);
1579 }
1580 rf_unlock_mutex2(raidPtr->mutex);
1581
1582 /* make a copy of the recon request so that we don't rely on
1583 * the user's buffer */
1584 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1585 if (rrcopy == NULL)
1586 return(ENOMEM);
1587 memcpy(rrcopy, rr, sizeof(*rr));
1588 rrcopy->raidPtr = (void *) raidPtr;
1589
1590 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1591 rf_ReconThread,
1592 rrcopy,"raid_recon");
1593 return (0);
1594
1595 /* invoke a copyback operation after recon on whatever disk
1596 * needs it, if any */
1597 case RAIDFRAME_COPYBACK:
1598
1599 if (raidPtr->Layout.map->faultsTolerated == 0) {
1600 /* This makes no sense on a RAID 0!! */
1601 return(EINVAL);
1602 }
1603
1604 if (raidPtr->copyback_in_progress == 1) {
1605 /* Copyback is already in progress! */
1606 return(EINVAL);
1607 }
1608
1609 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1610 rf_CopybackThread,
1611 raidPtr,"raid_copyback");
1612 return (retcode);
1613
1614 /* return the percentage completion of reconstruction */
1615 case RAIDFRAME_CHECK_RECON_STATUS:
1616 if (raidPtr->Layout.map->faultsTolerated == 0) {
1617 /* This makes no sense on a RAID 0, so tell the
1618 user it's done. */
1619 *(int *) data = 100;
1620 return(0);
1621 }
1622 if (raidPtr->status != rf_rs_reconstructing)
1623 *(int *) data = 100;
1624 else {
1625 if (raidPtr->reconControl->numRUsTotal > 0) {
1626 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1627 } else {
1628 *(int *) data = 0;
1629 }
1630 }
1631 return (0);
1632 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1633 progressInfoPtr = (RF_ProgressInfo_t **) data;
1634 if (raidPtr->status != rf_rs_reconstructing) {
1635 progressInfo.remaining = 0;
1636 progressInfo.completed = 100;
1637 progressInfo.total = 100;
1638 } else {
1639 progressInfo.total =
1640 raidPtr->reconControl->numRUsTotal;
1641 progressInfo.completed =
1642 raidPtr->reconControl->numRUsComplete;
1643 progressInfo.remaining = progressInfo.total -
1644 progressInfo.completed;
1645 }
1646 retcode = copyout(&progressInfo, *progressInfoPtr,
1647 sizeof(RF_ProgressInfo_t));
1648 return (retcode);
1649
1650 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1651 if (raidPtr->Layout.map->faultsTolerated == 0) {
1652 /* This makes no sense on a RAID 0, so tell the
1653 user it's done. */
1654 *(int *) data = 100;
1655 return(0);
1656 }
1657 if (raidPtr->parity_rewrite_in_progress == 1) {
1658 *(int *) data = 100 *
1659 raidPtr->parity_rewrite_stripes_done /
1660 raidPtr->Layout.numStripe;
1661 } else {
1662 *(int *) data = 100;
1663 }
1664 return (0);
1665
1666 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1667 progressInfoPtr = (RF_ProgressInfo_t **) data;
1668 if (raidPtr->parity_rewrite_in_progress == 1) {
1669 progressInfo.total = raidPtr->Layout.numStripe;
1670 progressInfo.completed =
1671 raidPtr->parity_rewrite_stripes_done;
1672 progressInfo.remaining = progressInfo.total -
1673 progressInfo.completed;
1674 } else {
1675 progressInfo.remaining = 0;
1676 progressInfo.completed = 100;
1677 progressInfo.total = 100;
1678 }
1679 retcode = copyout(&progressInfo, *progressInfoPtr,
1680 sizeof(RF_ProgressInfo_t));
1681 return (retcode);
1682
1683 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1684 if (raidPtr->Layout.map->faultsTolerated == 0) {
1685 /* This makes no sense on a RAID 0 */
1686 *(int *) data = 100;
1687 return(0);
1688 }
1689 if (raidPtr->copyback_in_progress == 1) {
1690 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1691 raidPtr->Layout.numStripe;
1692 } else {
1693 *(int *) data = 100;
1694 }
1695 return (0);
1696
1697 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1698 progressInfoPtr = (RF_ProgressInfo_t **) data;
1699 if (raidPtr->copyback_in_progress == 1) {
1700 progressInfo.total = raidPtr->Layout.numStripe;
1701 progressInfo.completed =
1702 raidPtr->copyback_stripes_done;
1703 progressInfo.remaining = progressInfo.total -
1704 progressInfo.completed;
1705 } else {
1706 progressInfo.remaining = 0;
1707 progressInfo.completed = 100;
1708 progressInfo.total = 100;
1709 }
1710 retcode = copyout(&progressInfo, *progressInfoPtr,
1711 sizeof(RF_ProgressInfo_t));
1712 return (retcode);
1713
1714 /* the sparetable daemon calls this to wait for the kernel to
1715 * need a spare table. this ioctl does not return until a
1716 * spare table is needed. XXX -- calling mpsleep here in the
1717 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1718 * -- I should either compute the spare table in the kernel,
1719 * or have a different -- XXX XXX -- interface (a different
1720 * character device) for delivering the table -- XXX */
1721 #if 0
1722 case RAIDFRAME_SPARET_WAIT:
1723 rf_lock_mutex2(rf_sparet_wait_mutex);
1724 while (!rf_sparet_wait_queue)
1725 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1726 waitreq = rf_sparet_wait_queue;
1727 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1728 rf_unlock_mutex2(rf_sparet_wait_mutex);
1729
1730 /* structure assignment */
1731 *((RF_SparetWait_t *) data) = *waitreq;
1732
1733 RF_Free(waitreq, sizeof(*waitreq));
1734 return (0);
1735
1736 /* wakes up a process waiting on SPARET_WAIT and puts an error
	 * code in it that will cause the daemon to exit */
1738 case RAIDFRAME_ABORT_SPARET_WAIT:
1739 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1740 waitreq->fcol = -1;
1741 rf_lock_mutex2(rf_sparet_wait_mutex);
1742 waitreq->next = rf_sparet_wait_queue;
1743 rf_sparet_wait_queue = waitreq;
1744 rf_broadcast_conf2(rf_sparet_wait_cv);
1745 rf_unlock_mutex2(rf_sparet_wait_mutex);
1746 return (0);
1747
1748 /* used by the spare table daemon to deliver a spare table
1749 * into the kernel */
1750 case RAIDFRAME_SEND_SPARET:
1751
1752 /* install the spare table */
1753 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1754
1755 /* respond to the requestor. the return status of the spare
1756 * table installation is passed in the "fcol" field */
1757 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1758 waitreq->fcol = retcode;
1759 rf_lock_mutex2(rf_sparet_wait_mutex);
1760 waitreq->next = rf_sparet_resp_queue;
1761 rf_sparet_resp_queue = waitreq;
1762 rf_broadcast_cond2(rf_sparet_resp_cv);
1763 rf_unlock_mutex2(rf_sparet_wait_mutex);
1764
1765 return (retcode);
1766 #endif
1767
1768 default:
1769 break; /* fall through to the os-specific code below */
1770
1771 }
1772
1773 if (!raidPtr->valid)
1774 return (EINVAL);
1775
1776 /*
1777 * Add support for "regular" device ioctls here.
1778 */
1779
1780 error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
1781 if (error != EPASSTHROUGH)
1782 return (error);
1783
1784 switch (cmd) {
1785 case DIOCGDINFO:
1786 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1787 break;
1788 #ifdef __HAVE_OLD_DISKLABEL
1789 case ODIOCGDINFO:
1790 newlabel = *(rs->sc_dkdev.dk_label);
1791 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1792 return ENOTTY;
1793 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1794 break;
1795 #endif
1796
1797 case DIOCGPART:
1798 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1799 ((struct partinfo *) data)->part =
1800 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1801 break;
1802
1803 case DIOCWDINFO:
1804 case DIOCSDINFO:
1805 #ifdef __HAVE_OLD_DISKLABEL
1806 case ODIOCWDINFO:
1807 case ODIOCSDINFO:
1808 #endif
1809 {
1810 struct disklabel *lp;
1811 #ifdef __HAVE_OLD_DISKLABEL
1812 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1813 memset(&newlabel, 0, sizeof newlabel);
1814 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1815 lp = &newlabel;
1816 } else
1817 #endif
1818 lp = (struct disklabel *)data;
1819
1820 if ((error = raidlock(rs)) != 0)
1821 return (error);
1822
1823 rs->sc_flags |= RAIDF_LABELLING;
1824
1825 error = setdisklabel(rs->sc_dkdev.dk_label,
1826 lp, 0, rs->sc_dkdev.dk_cpulabel);
1827 if (error == 0) {
1828 if (cmd == DIOCWDINFO
1829 #ifdef __HAVE_OLD_DISKLABEL
1830 || cmd == ODIOCWDINFO
1831 #endif
1832 )
1833 error = writedisklabel(RAIDLABELDEV(dev),
1834 raidstrategy, rs->sc_dkdev.dk_label,
1835 rs->sc_dkdev.dk_cpulabel);
1836 }
1837 rs->sc_flags &= ~RAIDF_LABELLING;
1838
1839 raidunlock(rs);
1840
1841 if (error)
1842 return (error);
1843 break;
1844 }
1845
1846 case DIOCWLABEL:
1847 if (*(int *) data != 0)
1848 rs->sc_flags |= RAIDF_WLABEL;
1849 else
1850 rs->sc_flags &= ~RAIDF_WLABEL;
1851 break;
1852
1853 case DIOCGDEFLABEL:
1854 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1855 break;
1856
1857 #ifdef __HAVE_OLD_DISKLABEL
1858 case ODIOCGDEFLABEL:
1859 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1860 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1861 return ENOTTY;
1862 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1863 break;
1864 #endif
1865
1866 case DIOCAWEDGE:
1867 case DIOCDWEDGE:
1868 dkw = (void *)data;
1869
1870 /* If the ioctl happens here, the parent is us. */
1871 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1872 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1873
1874 case DIOCLWEDGES:
1875 return dkwedge_list(&rs->sc_dkdev,
1876 (struct dkwedge_list *)data, l);
1877 case DIOCCACHESYNC:
1878 return rf_sync_component_caches(raidPtr);
1879
1880 case DIOCGSTRATEGY:
1881 {
1882 struct disk_strategy *dks = (void *)data;
1883
1884 s = splbio();
1885 strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
1886 sizeof(dks->dks_name));
1887 splx(s);
1888 dks->dks_paramlen = 0;
1889
1890 return 0;
1891 }
1892
1893 case DIOCSSTRATEGY:
1894 {
1895 struct disk_strategy *dks = (void *)data;
1896 struct bufq_state *new;
1897 struct bufq_state *old;
1898
1899 if (dks->dks_param != NULL) {
1900 return EINVAL;
1901 }
1902 dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
1903 error = bufq_alloc(&new, dks->dks_name,
1904 BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
1905 if (error) {
1906 return error;
1907 }
1908 s = splbio();
1909 old = rs->buf_queue;
1910 bufq_move(new, old);
1911 rs->buf_queue = new;
1912 splx(s);
1913 bufq_free(old);
1914
1915 return 0;
1916 }
1917
1918 default:
1919 retcode = ENOTTY;
1920 }
1921 return (retcode);
1922
1923 }
1924
1925
1926 /* raidinit -- complete the rest of the initialization for the
1927 RAIDframe device. */
1928
1929
1930 static void
1931 raidinit(struct raid_softc *rs)
1932 {
1933 cfdata_t cf;
1934 int unit;
1935 RF_Raid_t *raidPtr = &rs->sc_r;
1936
1937 unit = raidPtr->raidid;
1938
1939
1940 /* XXX should check return code first... */
1941 rs->sc_flags |= RAIDF_INITED;
1942
1943 /* XXX doesn't check bounds. */
1944 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1945
1946 /* attach the pseudo device */
1947 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1948 cf->cf_name = raid_cd.cd_name;
1949 cf->cf_atname = raid_cd.cd_name;
1950 cf->cf_unit = unit;
1951 cf->cf_fstate = FSTATE_STAR;
1952
1953 rs->sc_dev = config_attach_pseudo(cf);
1954
1955 if (rs->sc_dev == NULL) {
1956 printf("raid%d: config_attach_pseudo failed\n",
1957 raidPtr->raidid);
1958 rs->sc_flags &= ~RAIDF_INITED;
1959 free(cf, M_RAIDFRAME);
1960 return;
1961 }
1962
1963 /* disk_attach actually creates space for the CPU disklabel, among
1964 * other things, so it's critical to call this *BEFORE* we try putzing
1965 * with disklabels. */
1966
1967 disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1968 disk_attach(&rs->sc_dkdev);
1969 disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
1970
1971 /* XXX There may be a weird interaction here between this, and
1972 * protectedSectors, as used in RAIDframe. */
1973
1974 rs->sc_size = raidPtr->totalSectors;
1975
1976 dkwedge_discover(&rs->sc_dkdev);
1977
1978 rf_set_geometry(rs, raidPtr);
1979
1980 }
1981 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1982 /* wake up the daemon & tell it to get us a spare table
1983 * XXX
1984 * the entries in the queues should be tagged with the raidPtr
1985 * so that in the extremely rare case that two recons happen at once,
1986 * we know for which device were requesting a spare table
1987 * XXX
1988 *
1989 * XXX This code is not currently used. GO
1990 */
1991 int
1992 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1993 {
1994 int retcode;
1995
1996 rf_lock_mutex2(rf_sparet_wait_mutex);
1997 req->next = rf_sparet_wait_queue;
1998 rf_sparet_wait_queue = req;
1999 rf_broadcast_cond2(rf_sparet_wait_cv);
2000
2001 /* mpsleep unlocks the mutex */
2002 while (!rf_sparet_resp_queue) {
2003 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
2004 }
2005 req = rf_sparet_resp_queue;
2006 rf_sparet_resp_queue = req->next;
2007 rf_unlock_mutex2(rf_sparet_wait_mutex);
2008
2009 retcode = req->fcol;
2010 RF_Free(req, sizeof(*req)); /* this is not the same req as we
2011 * alloc'd */
2012 return (retcode);
2013 }
2014 #endif
2015
2016 /* a wrapper around rf_DoAccess that extracts appropriate info from the
2017 * bp & passes it down.
2018 * any calls originating in the kernel must use non-blocking I/O
2019 * do some extra sanity checking to return "appropriate" error values for
2020 * certain conditions (to make some standard utilities work)
2021 *
2022 * Formerly known as: rf_DoAccessKernel
2023 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the mutex around the label update, then
		 * re-acquire it before touching numNewFailures. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
		    RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* NOTE: raidPtr->mutex is held at the top of every iteration
	 * of this loop and released for the duration of the body. */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* Convert from DEV_BSIZE units to RAID sector units. */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb is 1 iff the transfer has a partial trailing sector. */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject requests past the end of the set; the extra
		 * "sum <" comparisons also catch arithmetic wrap. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* re-take the mutex for the loop condition */
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Reject transfers that are not a whole number of
		 * sectors. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* Take one of the per-set openings for this I/O; it is
		 * returned elsewhere when the access completes (not
		 * visible in this function). */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* rf_DoAccess() rejected the request up front;
			 * fail the buf here (presumably no completion
			 * callback will run for it — confirm against
			 * rf_DoAccess). */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
2141
2142
2143
2144
2145 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2146
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	/* Map the RAIDframe I/O direction onto buf(9) flags. */
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately via the normal I/O
		 * completion path. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up bp for the component device; KernelWakeupFunc
		 * is the completion callback and req rides along as
		 * its argument. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			    (int) req->sectorOffset, (int) req->numSector,
			    (int) (req->numSector <<
			     queue->raidPtr->logBytesPerSector),
			    (int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
2220 /* this is the callback function associated with a I/O invoked from
2221 kernel code.
2222 */
/*
 * b_iodone callback for component I/O issued by rf_DispatchKernelIO().
 * Recovers the RF_DiskQueueData_t from bp->b_private, records any error
 * (possibly failing the component), and hands the request to the raidio
 * thread via the iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error. Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2288
2289
2290 /*
2291 * initialize a buf structure for doing an I/O in the kernel.
2292 */
2293 static void
2294 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2295 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2296 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2297 struct proc *b_proc)
2298 {
2299 /* bp->b_flags = B_PHYS | rw_flag; */
2300 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2301 bp->b_oflags = 0;
2302 bp->b_cflags = 0;
2303 bp->b_bcount = numSect << logBytesPerSector;
2304 bp->b_bufsize = bp->b_bcount;
2305 bp->b_error = 0;
2306 bp->b_dev = dev;
2307 bp->b_data = bf;
2308 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2309 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2310 if (bp->b_bcount == 0) {
2311 panic("bp->b_bcount is zero in InitBP!!");
2312 }
2313 bp->b_proc = b_proc;
2314 bp->b_iodone = cbFunc;
2315 bp->b_private = cbArg;
2316 }
2317
2318 static void
2319 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2320 struct disklabel *lp)
2321 {
2322 memset(lp, 0, sizeof(*lp));
2323
2324 /* fabricate a label... */
2325 lp->d_secperunit = raidPtr->totalSectors;
2326 lp->d_secsize = raidPtr->bytesPerSector;
2327 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2328 lp->d_ntracks = 4 * raidPtr->numCol;
2329 lp->d_ncylinders = raidPtr->totalSectors /
2330 (lp->d_nsectors * lp->d_ntracks);
2331 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2332
2333 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2334 lp->d_type = DTYPE_RAID;
2335 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2336 lp->d_rpm = 3600;
2337 lp->d_interleave = 1;
2338 lp->d_flags = 0;
2339
2340 lp->d_partitions[RAW_PART].p_offset = 0;
2341 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2342 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2343 lp->d_npartitions = RAW_PART + 1;
2344
2345 lp->d_magic = DISKMAGIC;
2346 lp->d_magic2 = DISKMAGIC;
2347 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2348
2349 }
2350 /*
2351 * Read the disklabel from the raid device. If one is not present, fake one
2352 * up.
2353 */
static void
raidgetdisklabel(dev_t dev)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	const char *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	/* No such unit configured: nothing to do. */
	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* Start from a fabricated label; readdisklabel() below may
	 * replace it with the on-disk one. */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				    "exceeds the size of raid (%" PRIu64 ")\n",
				    unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
2412 /*
2413 * Take care of things one might want to take care of in the event
2414 * that a disklabel isn't present.
2415 */
2416 static void
2417 raidmakedisklabel(struct raid_softc *rs)
2418 {
2419 struct disklabel *lp = rs->sc_dkdev.dk_label;
2420 db1_printf(("Making a label..\n"));
2421
2422 /*
2423 * For historical reasons, if there's no disklabel present
2424 * the raw partition must be marked FS_BSDFFS.
2425 */
2426
2427 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2428
2429 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2430
2431 lp->d_checksum = dkcksum(lp);
2432 }
2433 /*
2434 * Wait interruptibly for an exclusive lock.
2435 *
2436 * XXX
2437 * Several drivers do this; it should be abstracted and made MP-safe.
2438 * (Hmm... where have we seen this warning before :-> GO )
2439 */
2440 static int
2441 raidlock(struct raid_softc *rs)
2442 {
2443 int error;
2444
2445 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2446 rs->sc_flags |= RAIDF_WANTED;
2447 if ((error =
2448 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2449 return (error);
2450 }
2451 rs->sc_flags |= RAIDF_LOCKED;
2452 return (0);
2453 }
2454 /*
2455 * Unlock and wake up any waiters.
2456 */
2457 static void
2458 raidunlock(struct raid_softc *rs)
2459 {
2460
2461 rs->sc_flags &= ~RAIDF_LOCKED;
2462 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2463 rs->sc_flags &= ~RAIDF_WANTED;
2464 wakeup(rs);
2465 }
2466 }
2467
2468
2469 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2470 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2471 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2472
/* Byte offset of the component-label area on every component. */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
2479
2480 static daddr_t
2481 rf_component_info_size(unsigned secsize)
2482 {
2483 daddr_t info_size;
2484
2485 KASSERT(secsize);
2486 if (secsize > RF_COMPONENT_INFO_SIZE)
2487 info_size = secsize;
2488 else
2489 info_size = RF_COMPONENT_INFO_SIZE;
2490
2491 return info_size;
2492 }
2493
2494 static daddr_t
2495 rf_parity_map_offset(RF_Raid_t *raidPtr)
2496 {
2497 daddr_t map_offset;
2498
2499 KASSERT(raidPtr->bytesPerSector);
2500 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2501 map_offset = raidPtr->bytesPerSector;
2502 else
2503 map_offset = RF_COMPONENT_INFO_SIZE;
2504 map_offset += rf_component_info_offset();
2505
2506 return map_offset;
2507 }
2508
2509 static daddr_t
2510 rf_parity_map_size(RF_Raid_t *raidPtr)
2511 {
2512 daddr_t map_size;
2513
2514 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2515 map_size = raidPtr->bytesPerSector;
2516 else
2517 map_size = RF_PARITY_MAP_SIZE;
2518
2519 return map_size;
2520 }
2521
2522 int
2523 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2524 {
2525 RF_ComponentLabel_t *clabel;
2526
2527 clabel = raidget_component_label(raidPtr, col);
2528 clabel->clean = RF_RAID_CLEAN;
2529 raidflush_component_label(raidPtr, col);
2530 return(0);
2531 }
2532
2533
2534 int
2535 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2536 {
2537 RF_ComponentLabel_t *clabel;
2538
2539 clabel = raidget_component_label(raidPtr, col);
2540 clabel->clean = RF_RAID_DIRTY;
2541 raidflush_component_label(raidPtr, col);
2542 return(0);
2543 }
2544
/*
 * Read column `col's on-disk component label into the in-core copy
 * (raid_cinfo[col].ci_label).  Returns the raidread_component_label()
 * error code.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2554
/*
 * Return a pointer to the in-core component label for column `col'.
 * Callers may modify it and then use raidflush_component_label() to
 * commit it to disk.
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2560
2561 int
2562 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2563 {
2564 RF_ComponentLabel_t *label;
2565
2566 label = &raidPtr->raid_cinfo[col].ci_label;
2567 label->mod_counter = raidPtr->mod_counter;
2568 #ifndef RF_NO_PARITY_MAP
2569 label->parity_map_modcount = label->mod_counter;
2570 #endif
2571 return raidwrite_component_label(raidPtr->bytesPerSector,
2572 raidPtr->Disks[col].dev,
2573 raidPtr->raid_cinfo[col].ci_vp, label);
2574 }
2575
2576
2577 static int
2578 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2579 RF_ComponentLabel_t *clabel)
2580 {
2581 return raidread_component_area(dev, b_vp, clabel,
2582 sizeof(RF_ComponentLabel_t),
2583 rf_component_info_offset(),
2584 rf_component_info_size(secsize));
2585 }
2586
2587 /* ARGSUSED */
2588 static int
2589 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2590 size_t msize, daddr_t offset, daddr_t dsize)
2591 {
2592 struct buf *bp;
2593 const struct bdevsw *bdev;
2594 int error;
2595
2596 /* XXX should probably ensure that we don't try to do this if
2597 someone has changed rf_protected_sectors. */
2598
2599 if (b_vp == NULL) {
2600 /* For whatever reason, this component is not valid.
2601 Don't try to read a component label from it. */
2602 return(EINVAL);
2603 }
2604
2605 /* get a block of the appropriate size... */
2606 bp = geteblk((int)dsize);
2607 bp->b_dev = dev;
2608
2609 /* get our ducks in a row for the read */
2610 bp->b_blkno = offset / DEV_BSIZE;
2611 bp->b_bcount = dsize;
2612 bp->b_flags |= B_READ;
2613 bp->b_resid = dsize;
2614
2615 bdev = bdevsw_lookup(bp->b_dev);
2616 if (bdev == NULL)
2617 return (ENXIO);
2618 (*bdev->d_strategy)(bp);
2619
2620 error = biowait(bp);
2621
2622 if (!error) {
2623 memcpy(data, bp->b_data, msize);
2624 }
2625
2626 brelse(bp, 0);
2627 return(error);
2628 }
2629
2630
2631 static int
2632 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2633 RF_ComponentLabel_t *clabel)
2634 {
2635 return raidwrite_component_area(dev, b_vp, clabel,
2636 sizeof(RF_ComponentLabel_t),
2637 rf_component_info_offset(),
2638 rf_component_info_size(secsize), 0);
2639 }
2640
2641 /* ARGSUSED */
2642 static int
2643 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2644 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2645 {
2646 struct buf *bp;
2647 const struct bdevsw *bdev;
2648 int error;
2649
2650 /* get a block of the appropriate size... */
2651 bp = geteblk((int)dsize);
2652 bp->b_dev = dev;
2653
2654 /* get our ducks in a row for the write */
2655 bp->b_blkno = offset / DEV_BSIZE;
2656 bp->b_bcount = dsize;
2657 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2658 bp->b_resid = dsize;
2659
2660 memset(bp->b_data, 0, dsize);
2661 memcpy(bp->b_data, data, msize);
2662
2663 bdev = bdevsw_lookup(bp->b_dev);
2664 if (bdev == NULL)
2665 return (ENXIO);
2666 (*bdev->d_strategy)(bp);
2667 if (asyncp)
2668 return 0;
2669 error = biowait(bp);
2670 brelse(bp, 0);
2671 if (error) {
2672 #if 1
2673 printf("Failed to write RAID component info!\n");
2674 #endif
2675 }
2676
2677 return(error);
2678 }
2679
2680 void
2681 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2682 {
2683 int c;
2684
2685 for (c = 0; c < raidPtr->numCol; c++) {
2686 /* Skip dead disks. */
2687 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2688 continue;
2689 /* XXXjld: what if an error occurs here? */
2690 raidwrite_component_area(raidPtr->Disks[c].dev,
2691 raidPtr->raid_cinfo[c].ci_vp, map,
2692 RF_PARITYMAP_NBYTE,
2693 rf_parity_map_offset(raidPtr),
2694 rf_parity_map_size(raidPtr), 0);
2695 }
2696 }
2697
2698 void
2699 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2700 {
2701 struct rf_paritymap_ondisk tmp;
2702 int c,first;
2703
2704 first=1;
2705 for (c = 0; c < raidPtr->numCol; c++) {
2706 /* Skip dead disks. */
2707 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2708 continue;
2709 raidread_component_area(raidPtr->Disks[c].dev,
2710 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2711 RF_PARITYMAP_NBYTE,
2712 rf_parity_map_offset(raidPtr),
2713 rf_parity_map_size(raidPtr));
2714 if (first) {
2715 memcpy(map, &tmp, sizeof(*map));
2716 first = 0;
2717 } else {
2718 rf_paritymap_merge(map, &tmp);
2719 }
2720 }
2721 }
2722
/*
 * Bump the set's modification counter and mark the component label of
 * every live component (and every in-use spare) dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	/* Column the spare stands in for; stays -1 if no match found. */
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which data column this spare replaced. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2782
2783
/*
 * Refresh the on-disk component labels of all optimal components and
 * all in-use spares.  When `final' is RF_FINAL_COMPONENT_UPDATE and
 * parity is known good, the labels are also marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	/* Column a spare stands in for; stays -1 if no match found. */
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which data column this spare replaced. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2858
2859 void
2860 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2861 {
2862
2863 if (vp != NULL) {
2864 if (auto_configured == 1) {
2865 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2866 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2867 vput(vp);
2868
2869 } else {
2870 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2871 }
2872 }
2873 }
2874
2875
2876 void
2877 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2878 {
2879 int r,c;
2880 struct vnode *vp;
2881 int acd;
2882
2883
2884 /* We take this opportunity to close the vnodes like we should.. */
2885
2886 for (c = 0; c < raidPtr->numCol; c++) {
2887 vp = raidPtr->raid_cinfo[c].ci_vp;
2888 acd = raidPtr->Disks[c].auto_configured;
2889 rf_close_component(raidPtr, vp, acd);
2890 raidPtr->raid_cinfo[c].ci_vp = NULL;
2891 raidPtr->Disks[c].auto_configured = 0;
2892 }
2893
2894 for (r = 0; r < raidPtr->numSpare; r++) {
2895 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2896 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2897 rf_close_component(raidPtr, vp, acd);
2898 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2899 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2900 }
2901 }
2902
2903
2904 void
2905 rf_ReconThread(struct rf_recon_req *req)
2906 {
2907 int s;
2908 RF_Raid_t *raidPtr;
2909
2910 s = splbio();
2911 raidPtr = (RF_Raid_t *) req->raidPtr;
2912 raidPtr->recon_in_progress = 1;
2913
2914 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2915 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2916
2917 RF_Free(req, sizeof(*req));
2918
2919 raidPtr->recon_in_progress = 0;
2920 splx(s);
2921
2922 /* That's all... */
2923 kthread_exit(0); /* does not return */
2924 }
2925
2926 void
2927 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2928 {
2929 int retcode;
2930 int s;
2931
2932 raidPtr->parity_rewrite_stripes_done = 0;
2933 raidPtr->parity_rewrite_in_progress = 1;
2934 s = splbio();
2935 retcode = rf_RewriteParity(raidPtr);
2936 splx(s);
2937 if (retcode) {
2938 printf("raid%d: Error re-writing parity (%d)!\n",
2939 raidPtr->raidid, retcode);
2940 } else {
2941 /* set the clean bit! If we shutdown correctly,
2942 the clean bit on each component label will get
2943 set */
2944 raidPtr->parity_good = RF_RAID_CLEAN;
2945 }
2946 raidPtr->parity_rewrite_in_progress = 0;
2947
2948 /* Anyone waiting for us to stop? If so, inform them... */
2949 if (raidPtr->waitShutdown) {
2950 wakeup(&raidPtr->parity_rewrite_in_progress);
2951 }
2952
2953 /* That's all... */
2954 kthread_exit(0); /* does not return */
2955 }
2956
2957
2958 void
2959 rf_CopybackThread(RF_Raid_t *raidPtr)
2960 {
2961 int s;
2962
2963 raidPtr->copyback_in_progress = 1;
2964 s = splbio();
2965 rf_CopybackReconstructedData(raidPtr);
2966 splx(s);
2967 raidPtr->copyback_in_progress = 0;
2968
2969 /* That's all... */
2970 kthread_exit(0); /* does not return */
2971 }
2972
2973
2974 void
2975 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2976 {
2977 int s;
2978 RF_Raid_t *raidPtr;
2979
2980 s = splbio();
2981 raidPtr = req->raidPtr;
2982 raidPtr->recon_in_progress = 1;
2983 rf_ReconstructInPlace(raidPtr, req->col);
2984 RF_Free(req, sizeof(*req));
2985 raidPtr->recon_in_progress = 0;
2986 splx(s);
2987
2988 /* That's all... */
2989 kthread_exit(0); /* does not return */
2990 }
2991
/*
 * Examine one candidate component (dev/vp): read its component label
 * and, if the label looks reasonable, prepend an RF_AutoConfig_t for
 * it to `ac_list' (taking ownership of `vp' and the label).  On a bad
 * or missing label the vnode is closed and released here.  Returns the
 * (possibly extended) list, or NULL if allocation failed (the whole
 * list is freed in that case).
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* Out of memory: tear down everything collected so far. */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			    cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: this component is not usable, so release the
		   label memory and the vnode we were handed. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
3049
/*
 * Scan every disk-class device in the system for RAIDframe components:
 * wedges of type RAIDFRAME, disklabel partitions of type FS_RAID, and
 * (when neither is found) the raw partition.  Each candidate is handed
 * to rf_get_component(), which validates its component label.  Returns
 * the accumulated RF_AutoConfig_t list (may be NULL).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	    dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* dk wedges carry their type in the wedge info
			   rather than in a disklabel. */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
			rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
			    label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
3231
3232
3233 int
3234 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3235 {
3236
3237 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3238 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3239 ((clabel->clean == RF_RAID_CLEAN) ||
3240 (clabel->clean == RF_RAID_DIRTY)) &&
3241 clabel->row >=0 &&
3242 clabel->column >= 0 &&
3243 clabel->num_rows > 0 &&
3244 clabel->num_columns > 0 &&
3245 clabel->row < clabel->num_rows &&
3246 clabel->column < clabel->num_columns &&
3247 clabel->blockSize > 0 &&
3248 /*
3249 * numBlocksHi may contain garbage, but it is ok since
3250 * the type is unsigned. If it is really garbage,
3251 * rf_fix_old_label_size() will fix it.
3252 */
3253 rf_component_label_numblocks(clabel) > 0) {
3254 /*
3255 * label looks reasonable enough...
3256 * let's make sure it has no old garbage.
3257 */
3258 if (numsecs)
3259 rf_fix_old_label_size(clabel, numsecs);
3260 return(1);
3261 }
3262 return(0);
3263 }
3264
3265
3266 /*
3267 * For reasons yet unknown, some old component labels have garbage in
3268 * the newer numBlocksHi region, and this causes lossage. Since those
3269 * disks will also have numsecs set to less than 32 bits of sectors,
3270 * we can determine when this corruption has occurred, and fix it.
3271 *
3272 * The exact same problem, with the same unknown reason, happens to
3273 * the partitionSizeHi member as well.
3274 */
3275 static void
3276 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3277 {
3278
3279 if (numsecs < ((uint64_t)1 << 32)) {
3280 if (clabel->numBlocksHi) {
3281 printf("WARNING: total sectors < 32 bits, yet "
3282 "numBlocksHi set\n"
3283 "WARNING: resetting numBlocksHi to zero.\n");
3284 clabel->numBlocksHi = 0;
3285 }
3286
3287 if (clabel->partitionSizeHi) {
3288 printf("WARNING: total sectors < 32 bits, yet "
3289 "partitionSizeHi set\n"
3290 "WARNING: resetting partitionSizeHi to zero.\n");
3291 clabel->partitionSizeHi = 0;
3292 }
3293 }
3294 }
3295
3296
3297 #ifdef DEBUG
/*
 * Dump the interesting fields of a component label to the console
 * (debug builds only).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;

	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
3326 #endif
3327
/*
 * Partition the autoconfig component list into configuration sets:
 * each RF_ConfigSet_t collects the components that belong to the same
 * RAID set (as decided by rf_does_it_fit()).  Consumes the links of
 * `ac_list'; panics on allocation failure.
 */
RF_ConfigSet_t *
rf_create_auto_sets(RF_AutoConfig_t *ac_list)
{
	RF_AutoConfig_t *ac;
	RF_ConfigSet_t *config_sets;
	RF_ConfigSet_t *cset;
	RF_AutoConfig_t *ac_next;


	config_sets = NULL;

	/* Go through the AutoConfig list, and figure out which components
	   belong to what sets. */
	ac = ac_list;
	while(ac!=NULL) {
		/* we're going to putz with ac->next, so save it here
		   for use at the end of the loop */
		ac_next = ac->next;

		if (config_sets == NULL) {
			/* will need at least this one... */
			config_sets = (RF_ConfigSet_t *)
				malloc(sizeof(RF_ConfigSet_t),
				       M_RAIDFRAME, M_NOWAIT);
			if (config_sets == NULL) {
				panic("rf_create_auto_sets: No memory!");
			}
			/* this one is easy :) */
			config_sets->ac = ac;
			config_sets->next = NULL;
			config_sets->rootable = 0;
			ac->next = NULL;
		} else {
			/* which set does this component fit into? */
			cset = config_sets;
			while(cset!=NULL) {
				if (rf_does_it_fit(cset, ac)) {
					/* looks like it matches... */
					ac->next = cset->ac;
					cset->ac = ac;
					break;
				}
				cset = cset->next;
			}
			if (cset==NULL) {
				/* didn't find a match above... new set..*/
				cset = (RF_ConfigSet_t *)
					malloc(sizeof(RF_ConfigSet_t),
					       M_RAIDFRAME, M_NOWAIT);
				if (cset == NULL) {
					panic("rf_create_auto_sets: No memory!");
				}
				cset->ac = ac;
				ac->next = NULL;
				cset->next = config_sets;
				cset->rootable = 0;
				config_sets = cset;
			}
		}
		ac = ac_next;
	}


	return(config_sets);
}
3393
3394 static int
3395 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3396 {
3397 RF_ComponentLabel_t *clabel1, *clabel2;
3398
3399 /* If this one matches the *first* one in the set, that's good
3400 enough, since the other members of the set would have been
3401 through here too... */
3402 /* note that we are not checking partitionSize here..
3403
3404 Note that we are also not checking the mod_counters here.
3405 If everything else matches except the mod_counter, that's
3406 good enough for this test. We will deal with the mod_counters
3407 a little later in the autoconfiguration process.
3408
3409 (clabel1->mod_counter == clabel2->mod_counter) &&
3410
3411 The reason we don't check for this is that failed disks
3412 will have lower modification counts. If those disks are
3413 not added to the set they used to belong to, then they will
3414 form their own set, which may result in 2 different sets,
3415 for example, competing to be configured at raid0, and
3416 perhaps competing to be the root filesystem set. If the
3417 wrong ones get configured, or both attempt to become /,
3418 weird behaviour and or serious lossage will occur. Thus we
3419 need to bring them into the fold here, and kick them out at
3420 a later point.
3421
3422 */
3423
3424 clabel1 = cset->ac->clabel;
3425 clabel2 = ac->clabel;
3426 if ((clabel1->version == clabel2->version) &&
3427 (clabel1->serial_number == clabel2->serial_number) &&
3428 (clabel1->num_rows == clabel2->num_rows) &&
3429 (clabel1->num_columns == clabel2->num_columns) &&
3430 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3431 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3432 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3433 (clabel1->parityConfig == clabel2->parityConfig) &&
3434 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3435 (clabel1->blockSize == clabel2->blockSize) &&
3436 rf_component_label_numblocks(clabel1) ==
3437 rf_component_label_numblocks(clabel2) &&
3438 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3439 (clabel1->root_partition == clabel2->root_partition) &&
3440 (clabel1->last_unit == clabel2->last_unit) &&
3441 (clabel1->config_order == clabel2->config_order)) {
3442 /* if it get's here, it almost *has* to be a match */
3443 } else {
3444 /* it's not consistent with somebody in the set..
3445 punt */
3446 return(0);
3447 }
3448 /* all was fine.. it must fit... */
3449 return(1);
3450 }
3451
/*
 * Decide whether configuration set `cset' has enough live components to
 * be configured.  Returns 1 if so, 0 if too many components are missing.
 *
 * A component counts as "present" for a column only if its mod_counter
 * equals the highest mod_counter seen in the set (stale components with
 * lower counters are treated as missing).
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the maximum over all members. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* For each column, look for a component with the current
	   mod_counter; tally missing columns (or, for RAID 1, track
	   whether both halves of a mirror pair are gone). */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just finished the odd component of a mirror
			   pair without bailing out.. reset the
			   even_pair_failed flag before starting on the
			   next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one.
	   (RAID 1 was fully handled by the pair logic above.) */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3554
3555 void
3556 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3557 RF_Raid_t *raidPtr)
3558 {
3559 RF_ComponentLabel_t *clabel;
3560 int i;
3561
3562 clabel = ac->clabel;
3563
3564 /* 1. Fill in the common stuff */
3565 config->numRow = clabel->num_rows = 1;
3566 config->numCol = clabel->num_columns;
3567 config->numSpare = 0; /* XXX should this be set here? */
3568 config->sectPerSU = clabel->sectPerSU;
3569 config->SUsPerPU = clabel->SUsPerPU;
3570 config->SUsPerRU = clabel->SUsPerRU;
3571 config->parityConfig = clabel->parityConfig;
3572 /* XXX... */
3573 strcpy(config->diskQueueType,"fifo");
3574 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3575 config->layoutSpecificSize = 0; /* XXX ?? */
3576
3577 while(ac!=NULL) {
3578 /* row/col values will be in range due to the checks
3579 in reasonable_label() */
3580 strcpy(config->devnames[0][ac->clabel->column],
3581 ac->devname);
3582 ac = ac->next;
3583 }
3584
3585 for(i=0;i<RF_MAXDBGV;i++) {
3586 config->debugVars[i][0] = 0;
3587 }
3588 }
3589
3590 int
3591 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3592 {
3593 RF_ComponentLabel_t *clabel;
3594 int column;
3595 int sparecol;
3596
3597 raidPtr->autoconfigure = new_value;
3598
3599 for(column=0; column<raidPtr->numCol; column++) {
3600 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3601 clabel = raidget_component_label(raidPtr, column);
3602 clabel->autoconfigure = new_value;
3603 raidflush_component_label(raidPtr, column);
3604 }
3605 }
3606 for(column = 0; column < raidPtr->numSpare ; column++) {
3607 sparecol = raidPtr->numCol + column;
3608 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3609 clabel = raidget_component_label(raidPtr, sparecol);
3610 clabel->autoconfigure = new_value;
3611 raidflush_component_label(raidPtr, sparecol);
3612 }
3613 }
3614 return(new_value);
3615 }
3616
3617 int
3618 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3619 {
3620 RF_ComponentLabel_t *clabel;
3621 int column;
3622 int sparecol;
3623
3624 raidPtr->root_partition = new_value;
3625 for(column=0; column<raidPtr->numCol; column++) {
3626 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3627 clabel = raidget_component_label(raidPtr, column);
3628 clabel->root_partition = new_value;
3629 raidflush_component_label(raidPtr, column);
3630 }
3631 }
3632 for(column = 0; column < raidPtr->numSpare ; column++) {
3633 sparecol = raidPtr->numCol + column;
3634 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3635 clabel = raidget_component_label(raidPtr, sparecol);
3636 clabel->root_partition = new_value;
3637 raidflush_component_label(raidPtr, sparecol);
3638 }
3639 }
3640 return(new_value);
3641 }
3642
3643 void
3644 rf_release_all_vps(RF_ConfigSet_t *cset)
3645 {
3646 RF_AutoConfig_t *ac;
3647
3648 ac = cset->ac;
3649 while(ac!=NULL) {
3650 /* Close the vp, and give it back */
3651 if (ac->vp) {
3652 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3653 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3654 vput(ac->vp);
3655 ac->vp = NULL;
3656 }
3657 ac = ac->next;
3658 }
3659 }
3660
3661
3662 void
3663 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3664 {
3665 RF_AutoConfig_t *ac;
3666 RF_AutoConfig_t *next_ac;
3667
3668 ac = cset->ac;
3669 while(ac!=NULL) {
3670 next_ac = ac->next;
3671 /* nuke the label */
3672 free(ac->clabel, M_RAIDFRAME);
3673 /* cleanup the config structure */
3674 free(ac, M_RAIDFRAME);
3675 /* "next.." */
3676 ac = next_ac;
3677 }
3678 /* and, finally, nuke the config set */
3679 free(cset, M_RAIDFRAME);
3680 }
3681
3682
/*
 * Populate a component label from the current state of the RAID set:
 * identity (serial number, mod counter), geometry (columns, stripe
 * layout, block size/count) and configuration flags.  The label is
 * marked dirty and optimal; callers flush it to disk separately.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	/* splits the 64-bit sector count into the label's low/high words */
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3715
/*
 * Autoconfigure one configuration set: pick a raid unit (preferring the
 * unit the set was last configured on), build an RF_Config_t from the
 * component labels, and run the configuration.  On success the softc is
 * returned and the set may be flagged rootable; on failure the softc is
 * released and NULL is returned.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
		/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk upward from last_unit until a unit with no valid
	   configuration is found.  NOTE(review): raidget() appears to
	   always return a softc (no NULL check) -- presumably it
	   allocates on demand; confirm before relying on that. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		if (cset->ac->clabel->root_partition==1) {
			/* everything configured just fine.  Make a note
			   that this set is eligible to be root. */
			cset->rootable = 1;
			/* XXX do this here? */
			raidPtr->root_partition = 1;
		}
	} else {
		/* configuration failed -- give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup: the config structure is only needed during
	   configuration, on both the success and failure paths. */
	free(config, M_RAIDFRAME);
	return sc;
}
3781
3782 void
3783 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3784 {
3785 struct buf *bp;
3786 struct raid_softc *rs;
3787
3788 bp = (struct buf *)desc->bp;
3789 rs = desc->raidPtr->softc;
3790 disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
3791 (bp->b_flags & B_READ));
3792 }
3793
/*
 * Initialize one of the RAIDframe item pools: create the pool at
 * IPL_BIO, cap it at xmax items, pre-allocate xmin items, and keep the
 * pool from draining below xmin.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	/* prime before setting the low-water mark so the mark is
	   already satisfied */
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3803
3804 /*
3805 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
3806 * if there is IO pending and if that IO could possibly be done for a
3807 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3808 * otherwise.
3809 *
3810 */
3811
3812 int
3813 rf_buf_queue_check(RF_Raid_t *raidPtr)
3814 {
3815 struct raid_softc *rs = raidPtr->softc;
3816 if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
3817 /* there is work to do */
3818 return 0;
3819 }
3820 /* default is nothing to do */
3821 return 1;
3822 }
3823
3824 int
3825 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3826 {
3827 uint64_t numsecs;
3828 unsigned secsize;
3829 int error;
3830
3831 error = getdisksize(vp, &numsecs, &secsize);
3832 if (error == 0) {
3833 diskPtr->blockSize = secsize;
3834 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3835 diskPtr->partitionSize = numsecs;
3836 return 0;
3837 }
3838 return error;
3839 }
3840
/*
 * autoconf match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3846
/*
 * autoconf attach function: intentionally empty -- real setup happens
 * when the unit is configured (see raidinit()/rf_auto_config_set()).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3852
3853
/*
 * autoconf detach function: look up the softc for this unit, take the
 * unit lock, and run the real detach logic.  Returns ENXIO when the
 * unit does not exist, or the error from raidlock()/
 * raid_detach_unlocked().
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidget(device_unit(self));

	if (rs == NULL)
		return ENXIO;

	/* serialize against other operations on this unit */
	if ((error = raidlock(rs)) != 0)
		return (error);

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXXkd: raidput(rs) ??? */

	return error;
}
3874
/*
 * Publish a disk geometry for the RAID set via disk_set_info().  The
 * sectors-per-track and track counts are synthetic values derived from
 * the stripe layout and column count (a RAID set has no physical
 * geometry); only the total size and sector size are exact.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* 4 tracks per column is an arbitrary fiction */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
}
3889
3890 /*
3891 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3892 * We end up returning whatever error was returned by the first cache flush
3893 * that fails.
3894 */
3895
3896 int
3897 rf_sync_component_caches(RF_Raid_t *raidPtr)
3898 {
3899 int c, sparecol;
3900 int e,error;
3901 int force = 1;
3902
3903 error = 0;
3904 for (c = 0; c < raidPtr->numCol; c++) {
3905 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3906 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3907 &force, FWRITE, NOCRED);
3908 if (e) {
3909 if (e != ENODEV)
3910 printf("raid%d: cache flush to component %s failed.\n",
3911 raidPtr->raidid, raidPtr->Disks[c].devname);
3912 if (error == 0) {
3913 error = e;
3914 }
3915 }
3916 }
3917 }
3918
3919 for( c = 0; c < raidPtr->numSpare ; c++) {
3920 sparecol = raidPtr->numCol + c;
3921 /* Need to ensure that the reconstruct actually completed! */
3922 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3923 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3924 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3925 if (e) {
3926 if (e != ENODEV)
3927 printf("raid%d: cache flush to component %s failed.\n",
3928 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3929 if (error == 0) {
3930 error = e;
3931 }
3932 }
3933 }
3934 }
3935 return error;
3936 }
3937