rf_netbsdkintf.c revision 1.298.2.3 1 /* $NetBSD: rf_netbsdkintf.c,v 1.298.2.3 2013/06/23 06:20:21 tls Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.298.2.3 2013/06/23 06:20:21 tls Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129
130 #include <prop/proplib.h>
131
132 #include <dev/raidframe/raidframevar.h>
133 #include <dev/raidframe/raidframeio.h>
134 #include <dev/raidframe/rf_paritymap.h>
135
136 #include "rf_raid.h"
137 #include "rf_copyback.h"
138 #include "rf_dag.h"
139 #include "rf_dagflags.h"
140 #include "rf_desc.h"
141 #include "rf_diskqueue.h"
142 #include "rf_etimer.h"
143 #include "rf_general.h"
144 #include "rf_kintf.h"
145 #include "rf_options.h"
146 #include "rf_driver.h"
147 #include "rf_parityscan.h"
148 #include "rf_threadstuff.h"
149
150 #ifdef COMPAT_50
151 #include "rf_compat50.h"
152 #endif
153
154 #ifdef DEBUG
155 int rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else /* DEBUG */
158 #define db1_printf(a) { }
159 #endif /* DEBUG */
160
161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
162 static rf_declare_mutex2(rf_sparet_wait_mutex);
163 static rf_declare_cond2(rf_sparet_wait_cv);
164 static rf_declare_cond2(rf_sparet_resp_cv);
165
166 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
167 * spare table */
168 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
169 * installation process */
170 #endif
171
172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
173
174 /* prototypes */
175 static void KernelWakeupFunc(struct buf *);
176 static void InitBP(struct buf *, struct vnode *, unsigned,
177 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
178 void *, int, struct proc *);
179 struct raid_softc;
180 static void raidinit(struct raid_softc *);
181
182 void raidattach(int);
183 static int raid_match(device_t, cfdata_t, void *);
184 static void raid_attach(device_t, device_t, void *);
185 static int raid_detach(device_t, int);
186
187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
188 daddr_t, daddr_t);
189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
190 daddr_t, daddr_t, int);
191
192 static int raidwrite_component_label(unsigned,
193 dev_t, struct vnode *, RF_ComponentLabel_t *);
194 static int raidread_component_label(unsigned,
195 dev_t, struct vnode *, RF_ComponentLabel_t *);
196
197
198 dev_type_open(raidopen);
199 dev_type_close(raidclose);
200 dev_type_read(raidread);
201 dev_type_write(raidwrite);
202 dev_type_ioctl(raidioctl);
203 dev_type_strategy(raidstrategy);
204 dev_type_dump(raiddump);
205 dev_type_size(raidsize);
206
207 const struct bdevsw raid_bdevsw = {
208 raidopen, raidclose, raidstrategy, raidioctl,
209 raiddump, raidsize, D_DISK
210 };
211
212 const struct cdevsw raid_cdevsw = {
213 raidopen, raidclose, raidread, raidwrite, raidioctl,
214 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
215 };
216
217 static void raidminphys(struct buf *);
218
219 static struct dkdriver rf_dkdriver = { raidstrategy, raidminphys };
220
/* Per-unit driver state ("softc") for one raid(4) pseudo-device. */
struct raid_softc {
	device_t sc_dev;		/* autoconf device handle */
	int     sc_unit;		/* raid unit number (raidN) */
	int     sc_flags;		/* flags */
	int     sc_cflags;		/* configuration flags */
	uint64_t sc_size;		/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;			/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global 'raids' list */
};
233 /* sc_flags */
234 #define RAIDF_INITED 0x01 /* unit has been initialized */
235 #define RAIDF_WLABEL 0x02 /* label area is writable */
236 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
237 #define RAIDF_SHUTDOWN 0x08 /* unit is being shutdown */
238 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
239 #define RAIDF_LOCKED 0x80 /* unit is locked */
240
241 #define raidunit(x) DISKUNIT(x)
242
243 extern struct cfdriver raid_cd;
244 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
245 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
246 DVF_DETACH_SHUTDOWN);
247
248 /*
249 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
250 * Be aware that large numbers can allow the driver to consume a lot of
251 * kernel memory, especially on writes, and in degraded mode reads.
252 *
253 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
254 * a single 64K write will typically require 64K for the old data,
255 * 64K for the old parity, and 64K for the new parity, for a total
256 * of 192K (if the parity buffer is not re-used immediately).
257 * Even it if is used immediately, that's still 128K, which when multiplied
258 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
259 *
260 * Now in degraded mode, for example, a 64K read on the above setup may
261 * require data reconstruction, which will require *all* of the 4 remaining
262 * disks to participate -- 4 * 32K/disk == 128K again.
263 */
264
265 #ifndef RAIDOUTSTANDING
266 #define RAIDOUTSTANDING 6
267 #endif
268
269 #define RAIDLABELDEV(dev) \
270 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
271
272 /* declared here, and made public, for the benefit of KVM stuff.. */
273
274 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
275 struct disklabel *);
276 static void raidgetdisklabel(dev_t);
277 static void raidmakedisklabel(struct raid_softc *);
278
279 static int raidlock(struct raid_softc *);
280 static void raidunlock(struct raid_softc *);
281
282 static int raid_detach_unlocked(struct raid_softc *);
283
284 static void rf_markalldirty(RF_Raid_t *);
285 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
286
287 void rf_ReconThread(struct rf_recon_req *);
288 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
289 void rf_CopybackThread(RF_Raid_t *raidPtr);
290 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
291 int rf_autoconfig(device_t);
292 void rf_buildroothack(RF_ConfigSet_t *);
293
294 RF_AutoConfig_t *rf_find_raid_components(void);
295 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
296 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
297 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
298 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
299 int rf_set_autoconfig(RF_Raid_t *, int);
300 int rf_set_rootpartition(RF_Raid_t *, int);
301 void rf_release_all_vps(RF_ConfigSet_t *);
302 void rf_cleanup_config_set(RF_ConfigSet_t *);
303 int rf_have_enough_components(RF_ConfigSet_t *);
304 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
305 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
306
307 /*
308 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
309 * Note that this is overridden by having RAID_AUTOCONFIG as an option
310 * in the kernel config file.
311 */
312 #ifdef RAID_AUTOCONFIG
313 int raidautoconfig = 1;
314 #else
315 int raidautoconfig = 0;
316 #endif
317 static bool raidautoconfigdone = false;
318
319 struct RF_Pools_s rf_pools;
320
321 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
322 static kmutex_t raid_lock;
323
324 static struct raid_softc *
325 raidcreate(int unit) {
326 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
327 if (sc == NULL) {
328 #ifdef DIAGNOSTIC
329 printf("%s: out of memory\n", __func__);
330 #endif
331 return NULL;
332 }
333 sc->sc_unit = unit;
334 bufq_alloc(&sc->buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
335 return sc;
336 }
337
338 static void
339 raiddestroy(struct raid_softc *sc) {
340 bufq_free(sc->buf_queue);
341 kmem_free(sc, sizeof(*sc));
342 }
343
344 static struct raid_softc *
345 raidget(int unit) {
346 struct raid_softc *sc;
347 if (unit < 0) {
348 #ifdef DIAGNOSTIC
349 panic("%s: unit %d!", __func__, unit);
350 #endif
351 return NULL;
352 }
353 mutex_enter(&raid_lock);
354 LIST_FOREACH(sc, &raids, sc_link) {
355 if (sc->sc_unit == unit) {
356 mutex_exit(&raid_lock);
357 return sc;
358 }
359 }
360 mutex_exit(&raid_lock);
361 if ((sc = raidcreate(unit)) == NULL)
362 return NULL;
363 mutex_enter(&raid_lock);
364 LIST_INSERT_HEAD(&raids, sc, sc_link);
365 mutex_exit(&raid_lock);
366 return sc;
367 }
368
369 static void
370 raidput(struct raid_softc *sc) {
371 mutex_enter(&raid_lock);
372 LIST_REMOVE(sc, sc_link);
373 mutex_exit(&raid_lock);
374 raiddestroy(sc);
375 }
376
/*
 * Driver-wide initialization, run once at boot from the pseudo-device
 * attach path.  Units themselves are created lazily by raidget(), so
 * the requested count "num" is not used here.
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Spare-table installation rendezvous state (see rf_sparet_*_queue). */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	/* Hook the raid cfattach (generated by CFATTACH_DECL3_NEW above)
	 * into autoconf so raid devices can be attached. */
	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
409
410 int
411 rf_autoconfig(device_t self)
412 {
413 RF_AutoConfig_t *ac_list;
414 RF_ConfigSet_t *config_sets;
415
416 if (!raidautoconfig || raidautoconfigdone == true)
417 return (0);
418
419 /* XXX This code can only be run once. */
420 raidautoconfigdone = true;
421
422 /* 1. locate all RAID components on the system */
423 aprint_debug("Searching for RAID components...\n");
424 ac_list = rf_find_raid_components();
425
426 /* 2. Sort them into their respective sets. */
427 config_sets = rf_create_auto_sets(ac_list);
428
429 /*
430 * 3. Evaluate each set and configure the valid ones.
431 * This gets done in rf_buildroothack().
432 */
433 rf_buildroothack(config_sets);
434
435 return 1;
436 }
437
/*
 * Configure all autoconfigurable sets and, when appropriate, nominate
 * a RAID set as the root device (booted_device).  Consumes config_sets:
 * every set is released/cleaned up here regardless of outcome.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int col;
	int num_root;
	char *devname;
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/* Walk every candidate set; configure those that are complete and
	 * marked for autoconfiguration, releasing the rest. */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;	/* remember last rootable set */
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		/* Exactly one rootable set: use it.  With wedges, point at
		 * the "a" wedge of the set (best guess, see XXX below). */
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			/* XXX: How do we find the real root partition? */
			char cname[sizeof(cset->ac->devname)];
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			booted_device = dkwedge_find_by_wname(cname);
		} else
			booted_device = rsc->sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/* Several rootable sets: prefer the one that actually
		 * contains the device we booted from. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			for (col = 0; col < r->numCol; col++) {
				devname = r->Disks[col].devname;
				/* skip the "/dev/" prefix before comparing
				 * against the boot device's xname */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
				    strlen(device_xname(booted_device))) != 0)
					continue;
				aprint_debug("raid%d includes boot device %s\n",
				    sc->sc_unit, devname);
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
541
542
/*
 * Return the size, in DEV_BSIZE units, of the swap partition that a
 * crash dump would go to, or -1 if the unit/partition cannot be used.
 */
int
raidsize(dev_t dev)
{
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, unit, omask, size;

	unit = raidunit(dev);
	if ((rs = raidget(unit)) == NULL)
		return -1;
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (-1);

	part = DISKPART(dev);
	omask = rs->sc_dkdev.dk_openmask & (1 << part);
	lp = rs->sc_dkdev.dk_label;

	/* If the partition isn't already open, open it transiently so the
	 * label is valid; the matching close happens below. */
	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
		return (-1);

	/* Dumps are only supported to swap partitions. */
	if (lp->d_partitions[part].p_fstype != FS_SWAP)
		size = -1;
	else
		size = lp->d_partitions[part].p_size *
		    (lp->d_secsize / DEV_BSIZE);

	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
		return (-1);

	return (size);

}
575
/*
 * Crash-dump entry point: write "size" bytes at "va" to block "blkno"
 * of the dump partition.  Only RAID 1 sets are supported; the dump is
 * routed to a single live (or spared) component's underlying device.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int part, c, sparecol, j, scol, dumpto;
	int error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	raidPtr = &rs->sc_r;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* dumps must be in whole DEV_BSIZE blocks */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* refuse to write past the end of the raid device */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	 */

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
712 /* ARGSUSED */
/*
 * Open entry point (block and character).  Validates the partition,
 * tracks open masks so the unit cannot be unconfigured while open, and
 * marks components dirty on the first open of a configured set.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, pmask;
	int error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* refuse new opens while the unit is being shut down */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* first open of a configured unit: (re)read the disklabel */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
		     (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		 here... If so, we needn't do this, but then need some
		 other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
796 /* ARGSUSED */
/*
 * Close entry point.  Clears the per-partition open bit and, on the
 * last close of a configured unit, writes out final component labels.
 */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	int error = 0;
	int part;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(&rs->sc_r,
		    RF_FINAL_COMPONENT_UPDATE);

		/* If the kernel is shutting down, it will detach
		 * this RAID set soon enough.
		 */
	}

	raidunlock(rs);
	return (0);

}
845
/*
 * Strategy entry point: validate and bounds-check the buf, queue it on
 * the per-unit bufq, and wake the RAIDframe I/O thread.  Errors are
 * reported by completing the buf immediately via biodone().
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int wlabel;
	struct raid_softc *rs;

	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	if (bp->b_bcount == 0) {
		/* zero-length I/O: complete it successfully, nothing to do */
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size; /* device size in DEV_BSIZE unit */

		/* convert totalSectors (array sector units) to DEV_BSIZE units */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* scheduled the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
916 /* ARGSUSED */
917 int
918 raidread(dev_t dev, struct uio *uio, int flags)
919 {
920 int unit = raidunit(dev);
921 struct raid_softc *rs;
922
923 if ((rs = raidget(unit)) == NULL)
924 return ENXIO;
925
926 if ((rs->sc_flags & RAIDF_INITED) == 0)
927 return (ENXIO);
928
929 return (physio(raidstrategy, NULL, dev, B_READ, raidminphys, uio));
930
931 }
932 /* ARGSUSED */
933 int
934 raidwrite(dev_t dev, struct uio *uio, int flags)
935 {
936 int unit = raidunit(dev);
937 struct raid_softc *rs;
938
939 if ((rs = raidget(unit)) == NULL)
940 return ENXIO;
941
942 if ((rs->sc_flags & RAIDF_INITED) == 0)
943 return (ENXIO);
944
945 return (physio(raidstrategy, NULL, dev, B_WRITE, raidminphys, uio));
946
947 }
948
/*
 * Detach helper: shut down the RAIDframe engine (if configured) and
 * detach/destroy the disk.  Caller holds the unit lock.  Returns EBUSY
 * if any partition is still open, or the rf_Shutdown() error.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	int error;
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;

	/*
	 * If somebody has a partition mounted, we shouldn't
	 * shutdown.
	 */
	if (rs->sc_dkdev.dk_openmask != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		;	/* not initialized: nothing to do */
	else if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;
	else
		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);

	/* Detach the disk. */
	dkwedge_delall(&rs->sc_dkdev);
	disk_detach(&rs->sc_dkdev);
	disk_destroy(&rs->sc_dkdev);

	aprint_normal_dev(rs->sc_dev, "detached\n");

	return 0;
}
980
981 int
982 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
983 {
984 int unit = raidunit(dev);
985 int error = 0;
986 int part, pmask, s;
987 cfdata_t cf;
988 struct raid_softc *rs;
989 RF_Config_t *k_cfg, *u_cfg;
990 RF_Raid_t *raidPtr;
991 RF_RaidDisk_t *diskPtr;
992 RF_AccTotals_t *totals;
993 RF_DeviceConfig_t *d_cfg, **ucfgp;
994 u_char *specific_buf;
995 int retcode = 0;
996 int column;
997 /* int raidid; */
998 struct rf_recon_req *rrcopy, *rr;
999 RF_ComponentLabel_t *clabel;
1000 RF_ComponentLabel_t *ci_label;
1001 RF_ComponentLabel_t **clabel_ptr;
1002 RF_SingleComponent_t *sparePtr,*componentPtr;
1003 RF_SingleComponent_t component;
1004 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1005 int i, j, d;
1006 #ifdef __HAVE_OLD_DISKLABEL
1007 struct disklabel newlabel;
1008 #endif
1009 struct dkwedge_info *dkw;
1010
1011 if ((rs = raidget(unit)) == NULL)
1012 return ENXIO;
1013 raidPtr = &rs->sc_r;
1014
1015 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1016 (int) DISKPART(dev), (int) unit, cmd));
1017
1018 /* Must be open for writes for these commands... */
1019 switch (cmd) {
1020 #ifdef DIOCGSECTORSIZE
1021 case DIOCGSECTORSIZE:
1022 *(u_int *)data = raidPtr->bytesPerSector;
1023 return 0;
1024 case DIOCGMEDIASIZE:
1025 *(off_t *)data =
1026 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1027 return 0;
1028 #endif
1029 case DIOCSDINFO:
1030 case DIOCWDINFO:
1031 #ifdef __HAVE_OLD_DISKLABEL
1032 case ODIOCWDINFO:
1033 case ODIOCSDINFO:
1034 #endif
1035 case DIOCWLABEL:
1036 case DIOCAWEDGE:
1037 case DIOCDWEDGE:
1038 case DIOCSSTRATEGY:
1039 if ((flag & FWRITE) == 0)
1040 return (EBADF);
1041 }
1042
1043 /* Must be initialized for these... */
1044 switch (cmd) {
1045 case DIOCGDINFO:
1046 case DIOCSDINFO:
1047 case DIOCWDINFO:
1048 #ifdef __HAVE_OLD_DISKLABEL
1049 case ODIOCGDINFO:
1050 case ODIOCWDINFO:
1051 case ODIOCSDINFO:
1052 case ODIOCGDEFLABEL:
1053 #endif
1054 case DIOCGPART:
1055 case DIOCWLABEL:
1056 case DIOCGDEFLABEL:
1057 case DIOCAWEDGE:
1058 case DIOCDWEDGE:
1059 case DIOCLWEDGES:
1060 case DIOCCACHESYNC:
1061 case RAIDFRAME_SHUTDOWN:
1062 case RAIDFRAME_REWRITEPARITY:
1063 case RAIDFRAME_GET_INFO:
1064 case RAIDFRAME_RESET_ACCTOTALS:
1065 case RAIDFRAME_GET_ACCTOTALS:
1066 case RAIDFRAME_KEEP_ACCTOTALS:
1067 case RAIDFRAME_GET_SIZE:
1068 case RAIDFRAME_FAIL_DISK:
1069 case RAIDFRAME_COPYBACK:
1070 case RAIDFRAME_CHECK_RECON_STATUS:
1071 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1072 case RAIDFRAME_GET_COMPONENT_LABEL:
1073 case RAIDFRAME_SET_COMPONENT_LABEL:
1074 case RAIDFRAME_ADD_HOT_SPARE:
1075 case RAIDFRAME_REMOVE_HOT_SPARE:
1076 case RAIDFRAME_INIT_LABELS:
1077 case RAIDFRAME_REBUILD_IN_PLACE:
1078 case RAIDFRAME_CHECK_PARITY:
1079 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1080 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1081 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1082 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1083 case RAIDFRAME_SET_AUTOCONFIG:
1084 case RAIDFRAME_SET_ROOT:
1085 case RAIDFRAME_DELETE_COMPONENT:
1086 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1087 case RAIDFRAME_PARITYMAP_STATUS:
1088 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1089 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1090 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1091 case DIOCGSTRATEGY:
1092 case DIOCSSTRATEGY:
1093 if ((rs->sc_flags & RAIDF_INITED) == 0)
1094 return (ENXIO);
1095 }
1096
1097 switch (cmd) {
1098 #ifdef COMPAT_50
1099 case RAIDFRAME_GET_INFO50:
1100 return rf_get_info50(raidPtr, data);
1101
1102 case RAIDFRAME_CONFIGURE50:
1103 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1104 return retcode;
1105 goto config;
1106 #endif
1107 /* configure the system */
1108 case RAIDFRAME_CONFIGURE:
1109
1110 if (raidPtr->valid) {
1111 /* There is a valid RAID set running on this unit! */
1112 printf("raid%d: Device already configured!\n",unit);
1113 return(EINVAL);
1114 }
1115
1116 /* copy-in the configuration information */
1117 /* data points to a pointer to the configuration structure */
1118
1119 u_cfg = *((RF_Config_t **) data);
1120 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1121 if (k_cfg == NULL) {
1122 return (ENOMEM);
1123 }
1124 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1125 if (retcode) {
1126 RF_Free(k_cfg, sizeof(RF_Config_t));
1127 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1128 retcode));
1129 return (retcode);
1130 }
1131 goto config;
1132 config:
1133 /* allocate a buffer for the layout-specific data, and copy it
1134 * in */
1135 if (k_cfg->layoutSpecificSize) {
1136 if (k_cfg->layoutSpecificSize > 10000) {
1137 /* sanity check */
1138 RF_Free(k_cfg, sizeof(RF_Config_t));
1139 return (EINVAL);
1140 }
1141 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1142 (u_char *));
1143 if (specific_buf == NULL) {
1144 RF_Free(k_cfg, sizeof(RF_Config_t));
1145 return (ENOMEM);
1146 }
1147 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1148 k_cfg->layoutSpecificSize);
1149 if (retcode) {
1150 RF_Free(k_cfg, sizeof(RF_Config_t));
1151 RF_Free(specific_buf,
1152 k_cfg->layoutSpecificSize);
1153 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1154 retcode));
1155 return (retcode);
1156 }
1157 } else
1158 specific_buf = NULL;
1159 k_cfg->layoutSpecific = specific_buf;
1160
1161 /* should do some kind of sanity check on the configuration.
1162 * Store the sum of all the bytes in the last byte? */
1163
1164 /* configure the system */
1165
1166 /*
1167 * Clear the entire RAID descriptor, just to make sure
1168 * there is no stale data left in the case of a
1169 * reconfiguration
1170 */
1171 memset(raidPtr, 0, sizeof(*raidPtr));
1172 raidPtr->softc = rs;
1173 raidPtr->raidid = unit;
1174
1175 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1176
1177 if (retcode == 0) {
1178
1179 /* allow this many simultaneous IO's to
1180 this RAID device */
1181 raidPtr->openings = RAIDOUTSTANDING;
1182
1183 raidinit(rs);
1184 rf_markalldirty(raidPtr);
1185 }
1186 /* free the buffers. No return code here. */
1187 if (k_cfg->layoutSpecificSize) {
1188 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1189 }
1190 RF_Free(k_cfg, sizeof(RF_Config_t));
1191
1192 return (retcode);
1193
1194 /* shutdown the system */
1195 case RAIDFRAME_SHUTDOWN:
1196
1197 part = DISKPART(dev);
1198 pmask = (1 << part);
1199
1200 if ((error = raidlock(rs)) != 0)
1201 return (error);
1202
1203 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1204 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1205 (rs->sc_dkdev.dk_copenmask & pmask)))
1206 retcode = EBUSY;
1207 else {
1208 rs->sc_flags |= RAIDF_SHUTDOWN;
1209 rs->sc_dkdev.dk_copenmask &= ~pmask;
1210 rs->sc_dkdev.dk_bopenmask &= ~pmask;
1211 rs->sc_dkdev.dk_openmask &= ~pmask;
1212 retcode = 0;
1213 }
1214
1215 raidunlock(rs);
1216
1217 if (retcode != 0)
1218 return retcode;
1219
1220 /* free the pseudo device attach bits */
1221
1222 cf = device_cfdata(rs->sc_dev);
1223 if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
1224 free(cf, M_RAIDFRAME);
1225
1226 return (retcode);
1227 case RAIDFRAME_GET_COMPONENT_LABEL:
1228 clabel_ptr = (RF_ComponentLabel_t **) data;
1229 /* need to read the component label for the disk indicated
1230 by row,column in clabel */
1231
1232 /*
1233 * Perhaps there should be an option to skip the in-core
1234 * copy and hit the disk, as with disklabel(8).
1235 */
1236 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1237
1238 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1239
1240 if (retcode) {
1241 RF_Free(clabel, sizeof(*clabel));
1242 return retcode;
1243 }
1244
1245 clabel->row = 0; /* Don't allow looking at anything else.*/
1246
1247 column = clabel->column;
1248
1249 if ((column < 0) || (column >= raidPtr->numCol +
1250 raidPtr->numSpare)) {
1251 RF_Free(clabel, sizeof(*clabel));
1252 return EINVAL;
1253 }
1254
1255 RF_Free(clabel, sizeof(*clabel));
1256
1257 clabel = raidget_component_label(raidPtr, column);
1258
1259 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1260
1261 #if 0
1262 case RAIDFRAME_SET_COMPONENT_LABEL:
1263 clabel = (RF_ComponentLabel_t *) data;
1264
1265 /* XXX check the label for valid stuff... */
1266 /* Note that some things *should not* get modified --
1267 the user should be re-initing the labels instead of
1268 trying to patch things.
1269 */
1270
1271 raidid = raidPtr->raidid;
1272 #ifdef DEBUG
1273 printf("raid%d: Got component label:\n", raidid);
1274 printf("raid%d: Version: %d\n", raidid, clabel->version);
1275 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1276 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1277 printf("raid%d: Column: %d\n", raidid, clabel->column);
1278 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1279 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1280 printf("raid%d: Status: %d\n", raidid, clabel->status);
1281 #endif
1282 clabel->row = 0;
1283 column = clabel->column;
1284
1285 if ((column < 0) || (column >= raidPtr->numCol)) {
1286 return(EINVAL);
1287 }
1288
1289 /* XXX this isn't allowed to do anything for now :-) */
1290
1291 /* XXX and before it is, we need to fill in the rest
1292 of the fields!?!?!?! */
1293 memcpy(raidget_component_label(raidPtr, column),
1294 clabel, sizeof(*clabel));
1295 raidflush_component_label(raidPtr, column);
1296 return (0);
1297 #endif
1298
1299 case RAIDFRAME_INIT_LABELS:
1300 clabel = (RF_ComponentLabel_t *) data;
1301 /*
1302 we only want the serial number from
1303 the above. We get all the rest of the information
1304 from the config that was used to create this RAID
1305 set.
1306 */
1307
1308 raidPtr->serial_number = clabel->serial_number;
1309
1310 for(column=0;column<raidPtr->numCol;column++) {
1311 diskPtr = &raidPtr->Disks[column];
1312 if (!RF_DEAD_DISK(diskPtr->status)) {
1313 ci_label = raidget_component_label(raidPtr,
1314 column);
1315 /* Zeroing this is important. */
1316 memset(ci_label, 0, sizeof(*ci_label));
1317 raid_init_component_label(raidPtr, ci_label);
1318 ci_label->serial_number =
1319 raidPtr->serial_number;
1320 ci_label->row = 0; /* we dont' pretend to support more */
1321 rf_component_label_set_partitionsize(ci_label,
1322 diskPtr->partitionSize);
1323 ci_label->column = column;
1324 raidflush_component_label(raidPtr, column);
1325 }
1326 /* XXXjld what about the spares? */
1327 }
1328
1329 return (retcode);
1330 case RAIDFRAME_SET_AUTOCONFIG:
1331 d = rf_set_autoconfig(raidPtr, *(int *) data);
1332 printf("raid%d: New autoconfig value is: %d\n",
1333 raidPtr->raidid, d);
1334 *(int *) data = d;
1335 return (retcode);
1336
1337 case RAIDFRAME_SET_ROOT:
1338 d = rf_set_rootpartition(raidPtr, *(int *) data);
1339 printf("raid%d: New rootpartition value is: %d\n",
1340 raidPtr->raidid, d);
1341 *(int *) data = d;
1342 return (retcode);
1343
1344 /* initialize all parity */
1345 case RAIDFRAME_REWRITEPARITY:
1346
1347 if (raidPtr->Layout.map->faultsTolerated == 0) {
1348 /* Parity for RAID 0 is trivially correct */
1349 raidPtr->parity_good = RF_RAID_CLEAN;
1350 return(0);
1351 }
1352
1353 if (raidPtr->parity_rewrite_in_progress == 1) {
1354 /* Re-write is already in progress! */
1355 return(EINVAL);
1356 }
1357
1358 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1359 rf_RewriteParityThread,
1360 raidPtr,"raid_parity");
1361 return (retcode);
1362
1363
1364 case RAIDFRAME_ADD_HOT_SPARE:
1365 sparePtr = (RF_SingleComponent_t *) data;
1366 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1367 retcode = rf_add_hot_spare(raidPtr, &component);
1368 return(retcode);
1369
1370 case RAIDFRAME_REMOVE_HOT_SPARE:
1371 return(retcode);
1372
1373 case RAIDFRAME_DELETE_COMPONENT:
1374 componentPtr = (RF_SingleComponent_t *)data;
1375 memcpy( &component, componentPtr,
1376 sizeof(RF_SingleComponent_t));
1377 retcode = rf_delete_component(raidPtr, &component);
1378 return(retcode);
1379
1380 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1381 componentPtr = (RF_SingleComponent_t *)data;
1382 memcpy( &component, componentPtr,
1383 sizeof(RF_SingleComponent_t));
1384 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1385 return(retcode);
1386
1387 case RAIDFRAME_REBUILD_IN_PLACE:
1388
1389 if (raidPtr->Layout.map->faultsTolerated == 0) {
1390 /* Can't do this on a RAID 0!! */
1391 return(EINVAL);
1392 }
1393
1394 if (raidPtr->recon_in_progress == 1) {
1395 /* a reconstruct is already in progress! */
1396 return(EINVAL);
1397 }
1398
1399 componentPtr = (RF_SingleComponent_t *) data;
1400 memcpy( &component, componentPtr,
1401 sizeof(RF_SingleComponent_t));
1402 component.row = 0; /* we don't support any more */
1403 column = component.column;
1404
1405 if ((column < 0) || (column >= raidPtr->numCol)) {
1406 return(EINVAL);
1407 }
1408
1409 rf_lock_mutex2(raidPtr->mutex);
1410 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1411 (raidPtr->numFailures > 0)) {
1412 /* XXX 0 above shouldn't be constant!!! */
1413 /* some component other than this has failed.
1414 Let's not make things worse than they already
1415 are... */
1416 printf("raid%d: Unable to reconstruct to disk at:\n",
1417 raidPtr->raidid);
1418 printf("raid%d: Col: %d Too many failures.\n",
1419 raidPtr->raidid, column);
1420 rf_unlock_mutex2(raidPtr->mutex);
1421 return (EINVAL);
1422 }
1423 if (raidPtr->Disks[column].status ==
1424 rf_ds_reconstructing) {
1425 printf("raid%d: Unable to reconstruct to disk at:\n",
1426 raidPtr->raidid);
1427 printf("raid%d: Col: %d Reconstruction already occurring!\n", raidPtr->raidid, column);
1428
1429 rf_unlock_mutex2(raidPtr->mutex);
1430 return (EINVAL);
1431 }
1432 if (raidPtr->Disks[column].status == rf_ds_spared) {
1433 rf_unlock_mutex2(raidPtr->mutex);
1434 return (EINVAL);
1435 }
1436 rf_unlock_mutex2(raidPtr->mutex);
1437
1438 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1439 if (rrcopy == NULL)
1440 return(ENOMEM);
1441
1442 rrcopy->raidPtr = (void *) raidPtr;
1443 rrcopy->col = column;
1444
1445 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1446 rf_ReconstructInPlaceThread,
1447 rrcopy,"raid_reconip");
1448 return(retcode);
1449
1450 case RAIDFRAME_GET_INFO:
1451 if (!raidPtr->valid)
1452 return (ENODEV);
1453 ucfgp = (RF_DeviceConfig_t **) data;
1454 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1455 (RF_DeviceConfig_t *));
1456 if (d_cfg == NULL)
1457 return (ENOMEM);
1458 d_cfg->rows = 1; /* there is only 1 row now */
1459 d_cfg->cols = raidPtr->numCol;
1460 d_cfg->ndevs = raidPtr->numCol;
1461 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1462 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1463 return (ENOMEM);
1464 }
1465 d_cfg->nspares = raidPtr->numSpare;
1466 if (d_cfg->nspares >= RF_MAX_DISKS) {
1467 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1468 return (ENOMEM);
1469 }
1470 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1471 d = 0;
1472 for (j = 0; j < d_cfg->cols; j++) {
1473 d_cfg->devs[d] = raidPtr->Disks[j];
1474 d++;
1475 }
1476 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1477 d_cfg->spares[i] = raidPtr->Disks[j];
1478 }
1479 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1480 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1481
1482 return (retcode);
1483
1484 case RAIDFRAME_CHECK_PARITY:
1485 *(int *) data = raidPtr->parity_good;
1486 return (0);
1487
1488 case RAIDFRAME_PARITYMAP_STATUS:
1489 if (rf_paritymap_ineligible(raidPtr))
1490 return EINVAL;
1491 rf_paritymap_status(raidPtr->parity_map,
1492 (struct rf_pmstat *)data);
1493 return 0;
1494
1495 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1496 if (rf_paritymap_ineligible(raidPtr))
1497 return EINVAL;
1498 if (raidPtr->parity_map == NULL)
1499 return ENOENT; /* ??? */
1500 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1501 (struct rf_pmparams *)data, 1))
1502 return EINVAL;
1503 return 0;
1504
1505 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1506 if (rf_paritymap_ineligible(raidPtr))
1507 return EINVAL;
1508 *(int *) data = rf_paritymap_get_disable(raidPtr);
1509 return 0;
1510
1511 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1512 if (rf_paritymap_ineligible(raidPtr))
1513 return EINVAL;
1514 rf_paritymap_set_disable(raidPtr, *(int *)data);
1515 /* XXX should errors be passed up? */
1516 return 0;
1517
1518 case RAIDFRAME_RESET_ACCTOTALS:
1519 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1520 return (0);
1521
1522 case RAIDFRAME_GET_ACCTOTALS:
1523 totals = (RF_AccTotals_t *) data;
1524 *totals = raidPtr->acc_totals;
1525 return (0);
1526
1527 case RAIDFRAME_KEEP_ACCTOTALS:
1528 raidPtr->keep_acc_totals = *(int *)data;
1529 return (0);
1530
1531 case RAIDFRAME_GET_SIZE:
1532 *(int *) data = raidPtr->totalSectors;
1533 return (0);
1534
1535 /* fail a disk & optionally start reconstruction */
1536 case RAIDFRAME_FAIL_DISK:
1537
1538 if (raidPtr->Layout.map->faultsTolerated == 0) {
1539 /* Can't do this on a RAID 0!! */
1540 return(EINVAL);
1541 }
1542
1543 rr = (struct rf_recon_req *) data;
1544 rr->row = 0;
1545 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1546 return (EINVAL);
1547
1548
1549 rf_lock_mutex2(raidPtr->mutex);
1550 if (raidPtr->status == rf_rs_reconstructing) {
1551 /* you can't fail a disk while we're reconstructing! */
1552 /* XXX wrong for RAID6 */
1553 rf_unlock_mutex2(raidPtr->mutex);
1554 return (EINVAL);
1555 }
1556 if ((raidPtr->Disks[rr->col].status ==
1557 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1558 /* some other component has failed. Let's not make
1559 things worse. XXX wrong for RAID6 */
1560 rf_unlock_mutex2(raidPtr->mutex);
1561 return (EINVAL);
1562 }
1563 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1564 /* Can't fail a spared disk! */
1565 rf_unlock_mutex2(raidPtr->mutex);
1566 return (EINVAL);
1567 }
1568 rf_unlock_mutex2(raidPtr->mutex);
1569
1570 /* make a copy of the recon request so that we don't rely on
1571 * the user's buffer */
1572 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1573 if (rrcopy == NULL)
1574 return(ENOMEM);
1575 memcpy(rrcopy, rr, sizeof(*rr));
1576 rrcopy->raidPtr = (void *) raidPtr;
1577
1578 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1579 rf_ReconThread,
1580 rrcopy,"raid_recon");
1581 return (0);
1582
1583 /* invoke a copyback operation after recon on whatever disk
1584 * needs it, if any */
1585 case RAIDFRAME_COPYBACK:
1586
1587 if (raidPtr->Layout.map->faultsTolerated == 0) {
1588 /* This makes no sense on a RAID 0!! */
1589 return(EINVAL);
1590 }
1591
1592 if (raidPtr->copyback_in_progress == 1) {
1593 /* Copyback is already in progress! */
1594 return(EINVAL);
1595 }
1596
1597 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1598 rf_CopybackThread,
1599 raidPtr,"raid_copyback");
1600 return (retcode);
1601
1602 /* return the percentage completion of reconstruction */
1603 case RAIDFRAME_CHECK_RECON_STATUS:
1604 if (raidPtr->Layout.map->faultsTolerated == 0) {
1605 /* This makes no sense on a RAID 0, so tell the
1606 user it's done. */
1607 *(int *) data = 100;
1608 return(0);
1609 }
1610 if (raidPtr->status != rf_rs_reconstructing)
1611 *(int *) data = 100;
1612 else {
1613 if (raidPtr->reconControl->numRUsTotal > 0) {
1614 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1615 } else {
1616 *(int *) data = 0;
1617 }
1618 }
1619 return (0);
1620 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1621 progressInfoPtr = (RF_ProgressInfo_t **) data;
1622 if (raidPtr->status != rf_rs_reconstructing) {
1623 progressInfo.remaining = 0;
1624 progressInfo.completed = 100;
1625 progressInfo.total = 100;
1626 } else {
1627 progressInfo.total =
1628 raidPtr->reconControl->numRUsTotal;
1629 progressInfo.completed =
1630 raidPtr->reconControl->numRUsComplete;
1631 progressInfo.remaining = progressInfo.total -
1632 progressInfo.completed;
1633 }
1634 retcode = copyout(&progressInfo, *progressInfoPtr,
1635 sizeof(RF_ProgressInfo_t));
1636 return (retcode);
1637
1638 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1639 if (raidPtr->Layout.map->faultsTolerated == 0) {
1640 /* This makes no sense on a RAID 0, so tell the
1641 user it's done. */
1642 *(int *) data = 100;
1643 return(0);
1644 }
1645 if (raidPtr->parity_rewrite_in_progress == 1) {
1646 *(int *) data = 100 *
1647 raidPtr->parity_rewrite_stripes_done /
1648 raidPtr->Layout.numStripe;
1649 } else {
1650 *(int *) data = 100;
1651 }
1652 return (0);
1653
1654 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1655 progressInfoPtr = (RF_ProgressInfo_t **) data;
1656 if (raidPtr->parity_rewrite_in_progress == 1) {
1657 progressInfo.total = raidPtr->Layout.numStripe;
1658 progressInfo.completed =
1659 raidPtr->parity_rewrite_stripes_done;
1660 progressInfo.remaining = progressInfo.total -
1661 progressInfo.completed;
1662 } else {
1663 progressInfo.remaining = 0;
1664 progressInfo.completed = 100;
1665 progressInfo.total = 100;
1666 }
1667 retcode = copyout(&progressInfo, *progressInfoPtr,
1668 sizeof(RF_ProgressInfo_t));
1669 return (retcode);
1670
1671 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1672 if (raidPtr->Layout.map->faultsTolerated == 0) {
1673 /* This makes no sense on a RAID 0 */
1674 *(int *) data = 100;
1675 return(0);
1676 }
1677 if (raidPtr->copyback_in_progress == 1) {
1678 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1679 raidPtr->Layout.numStripe;
1680 } else {
1681 *(int *) data = 100;
1682 }
1683 return (0);
1684
1685 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1686 progressInfoPtr = (RF_ProgressInfo_t **) data;
1687 if (raidPtr->copyback_in_progress == 1) {
1688 progressInfo.total = raidPtr->Layout.numStripe;
1689 progressInfo.completed =
1690 raidPtr->copyback_stripes_done;
1691 progressInfo.remaining = progressInfo.total -
1692 progressInfo.completed;
1693 } else {
1694 progressInfo.remaining = 0;
1695 progressInfo.completed = 100;
1696 progressInfo.total = 100;
1697 }
1698 retcode = copyout(&progressInfo, *progressInfoPtr,
1699 sizeof(RF_ProgressInfo_t));
1700 return (retcode);
1701
1702 /* the sparetable daemon calls this to wait for the kernel to
1703 * need a spare table. this ioctl does not return until a
1704 * spare table is needed. XXX -- calling mpsleep here in the
1705 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1706 * -- I should either compute the spare table in the kernel,
1707 * or have a different -- XXX XXX -- interface (a different
1708 * character device) for delivering the table -- XXX */
1709 #if 0
1710 case RAIDFRAME_SPARET_WAIT:
1711 rf_lock_mutex2(rf_sparet_wait_mutex);
1712 while (!rf_sparet_wait_queue)
1713 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1714 waitreq = rf_sparet_wait_queue;
1715 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1716 rf_unlock_mutex2(rf_sparet_wait_mutex);
1717
1718 /* structure assignment */
1719 *((RF_SparetWait_t *) data) = *waitreq;
1720
1721 RF_Free(waitreq, sizeof(*waitreq));
1722 return (0);
1723
	/* wakes up a process waiting on SPARET_WAIT and puts an error
	 * code in it that will cause the daemon to exit */
1726 case RAIDFRAME_ABORT_SPARET_WAIT:
1727 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1728 waitreq->fcol = -1;
1729 rf_lock_mutex2(rf_sparet_wait_mutex);
1730 waitreq->next = rf_sparet_wait_queue;
1731 rf_sparet_wait_queue = waitreq;
1732 rf_broadcast_conf2(rf_sparet_wait_cv);
1733 rf_unlock_mutex2(rf_sparet_wait_mutex);
1734 return (0);
1735
1736 /* used by the spare table daemon to deliver a spare table
1737 * into the kernel */
1738 case RAIDFRAME_SEND_SPARET:
1739
1740 /* install the spare table */
1741 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1742
1743 /* respond to the requestor. the return status of the spare
1744 * table installation is passed in the "fcol" field */
1745 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1746 waitreq->fcol = retcode;
1747 rf_lock_mutex2(rf_sparet_wait_mutex);
1748 waitreq->next = rf_sparet_resp_queue;
1749 rf_sparet_resp_queue = waitreq;
1750 rf_broadcast_cond2(rf_sparet_resp_cv);
1751 rf_unlock_mutex2(rf_sparet_wait_mutex);
1752
1753 return (retcode);
1754 #endif
1755
1756 default:
1757 break; /* fall through to the os-specific code below */
1758
1759 }
1760
1761 if (!raidPtr->valid)
1762 return (EINVAL);
1763
1764 /*
1765 * Add support for "regular" device ioctls here.
1766 */
1767
1768 error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
1769 if (error != EPASSTHROUGH)
1770 return (error);
1771
1772 switch (cmd) {
1773 case DIOCGDINFO:
1774 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1775 break;
1776 #ifdef __HAVE_OLD_DISKLABEL
1777 case ODIOCGDINFO:
1778 newlabel = *(rs->sc_dkdev.dk_label);
1779 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1780 return ENOTTY;
1781 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1782 break;
1783 #endif
1784
1785 case DIOCGPART:
1786 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1787 ((struct partinfo *) data)->part =
1788 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1789 break;
1790
1791 case DIOCWDINFO:
1792 case DIOCSDINFO:
1793 #ifdef __HAVE_OLD_DISKLABEL
1794 case ODIOCWDINFO:
1795 case ODIOCSDINFO:
1796 #endif
1797 {
1798 struct disklabel *lp;
1799 #ifdef __HAVE_OLD_DISKLABEL
1800 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1801 memset(&newlabel, 0, sizeof newlabel);
1802 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1803 lp = &newlabel;
1804 } else
1805 #endif
1806 lp = (struct disklabel *)data;
1807
1808 if ((error = raidlock(rs)) != 0)
1809 return (error);
1810
1811 rs->sc_flags |= RAIDF_LABELLING;
1812
1813 error = setdisklabel(rs->sc_dkdev.dk_label,
1814 lp, 0, rs->sc_dkdev.dk_cpulabel);
1815 if (error == 0) {
1816 if (cmd == DIOCWDINFO
1817 #ifdef __HAVE_OLD_DISKLABEL
1818 || cmd == ODIOCWDINFO
1819 #endif
1820 )
1821 error = writedisklabel(RAIDLABELDEV(dev),
1822 raidstrategy, rs->sc_dkdev.dk_label,
1823 rs->sc_dkdev.dk_cpulabel);
1824 }
1825 rs->sc_flags &= ~RAIDF_LABELLING;
1826
1827 raidunlock(rs);
1828
1829 if (error)
1830 return (error);
1831 break;
1832 }
1833
1834 case DIOCWLABEL:
1835 if (*(int *) data != 0)
1836 rs->sc_flags |= RAIDF_WLABEL;
1837 else
1838 rs->sc_flags &= ~RAIDF_WLABEL;
1839 break;
1840
1841 case DIOCGDEFLABEL:
1842 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1843 break;
1844
1845 #ifdef __HAVE_OLD_DISKLABEL
1846 case ODIOCGDEFLABEL:
1847 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1848 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1849 return ENOTTY;
1850 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1851 break;
1852 #endif
1853
1854 case DIOCAWEDGE:
1855 case DIOCDWEDGE:
1856 dkw = (void *)data;
1857
1858 /* If the ioctl happens here, the parent is us. */
1859 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1860 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1861
1862 case DIOCLWEDGES:
1863 return dkwedge_list(&rs->sc_dkdev,
1864 (struct dkwedge_list *)data, l);
1865 case DIOCCACHESYNC:
1866 return rf_sync_component_caches(raidPtr);
1867
1868 case DIOCGSTRATEGY:
1869 {
1870 struct disk_strategy *dks = (void *)data;
1871
1872 s = splbio();
1873 strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
1874 sizeof(dks->dks_name));
1875 splx(s);
1876 dks->dks_paramlen = 0;
1877
1878 return 0;
1879 }
1880
1881 case DIOCSSTRATEGY:
1882 {
1883 struct disk_strategy *dks = (void *)data;
1884 struct bufq_state *new;
1885 struct bufq_state *old;
1886
1887 if (dks->dks_param != NULL) {
1888 return EINVAL;
1889 }
1890 dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
1891 error = bufq_alloc(&new, dks->dks_name,
1892 BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
1893 if (error) {
1894 return error;
1895 }
1896 s = splbio();
1897 old = rs->buf_queue;
1898 bufq_move(new, old);
1899 rs->buf_queue = new;
1900 splx(s);
1901 bufq_free(old);
1902
1903 return 0;
1904 }
1905
1906 default:
1907 retcode = ENOTTY;
1908 }
1909 return (retcode);
1910
1911 }
1912
1913
1914 /* raidinit -- complete the rest of the initialization for the
1915 RAIDframe device. */
1916
1917
/*
 * raidinit: complete the rest of the initialization for the RAIDframe
 * device: mark the unit initialized, attach the pseudo-device, set up
 * and attach the disk structure, and discover wedges.  Called after a
 * successful rf_Configure().  On attach failure the INITED flag is
 * cleared again and the routine returns without further cleanup.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	int unit;
	RF_Raid_t *raidPtr = &rs->sc_r;

	unit = raidPtr->raidid;


	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev == NULL) {
		/* Attach failed: undo the INITED flag and free the cfdata
		 * ourselves, since config_attach_pseudo() did not take
		 * ownership of it. */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		rs->sc_flags &= ~RAIDF_INITED;
		free(cf, M_RAIDFRAME);
		return;
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);
	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe. */

	rs->sc_size = raidPtr->totalSectors;

	/* look for wedges (GPT partitions etc.) on the new device */
	dkwedge_discover(&rs->sc_dkdev);

	rf_set_geometry(rs, raidPtr);

}
1969 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1970 /* wake up the daemon & tell it to get us a spare table
1971 * XXX
1972 * the entries in the queues should be tagged with the raidPtr
1973 * so that in the extremely rare case that two recons happen at once,
1974 * we know for which device were requesting a spare table
1975 * XXX
1976 *
1977 * XXX This code is not currently used. GO
1978 */
1979 int
1980 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1981 {
1982 int retcode;
1983
1984 rf_lock_mutex2(rf_sparet_wait_mutex);
1985 req->next = rf_sparet_wait_queue;
1986 rf_sparet_wait_queue = req;
1987 rf_broadcast_cond2(rf_sparet_wait_cv);
1988
1989 /* mpsleep unlocks the mutex */
1990 while (!rf_sparet_resp_queue) {
1991 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
1992 }
1993 req = rf_sparet_resp_queue;
1994 rf_sparet_resp_queue = req->next;
1995 rf_unlock_mutex2(rf_sparet_wait_mutex);
1996
1997 retcode = req->fcol;
1998 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1999 * alloc'd */
2000 return (retcode);
2001 }
2002 #endif
2003
2004 /* a wrapper around rf_DoAccess that extracts appropriate info from the
2005 * bp & passes it down.
2006 * any calls originating in the kernel must use non-blocking I/O
2007 * do some extra sanity checking to return "appropriate" error values for
2008 * certain conditions (to make some standard utilities work)
2009 *
2010 * Formerly known as: rf_DoAccessKernel
2011 */
/*
 * raidstart: drain buffers from the softc's buffer queue and submit
 * them to RAIDframe via rf_DoAccess(), honouring the per-set limit on
 * simultaneous I/Os (raidPtr->openings).  Requests that are not a
 * whole number of sectors, or that run past the end of the set, are
 * failed here with biodone().  Called with no locks held; the raid
 * mutex is taken and dropped internally.
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* component labels are updated with the mutex dropped */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Loop invariant: raidPtr->mutex is held at the top of each
	 * iteration and released while the buffer is processed. */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do (mutex is not held here) */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* reject requests that run past the end of the set; the
		 * "sum < ..." comparisons also catch arithmetic wrap */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* the request must be a whole number of sectors */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* consume one opening for this request */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* submission failed: fail the buffer here, since
			 * the completion path will never see it */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
2129
2130
2131
2132
2133 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2134
2135 int
2136 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2137 {
2138 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2139 struct buf *bp;
2140
2141 req->queue = queue;
2142 bp = req->bp;
2143
2144 switch (req->type) {
2145 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
2146 /* XXX need to do something extra here.. */
2147 /* I'm leaving this in, as I've never actually seen it used,
2148 * and I'd like folks to report it... GO */
2149 printf(("WAKEUP CALLED\n"));
2150 queue->numOutstanding++;
2151
2152 bp->b_flags = 0;
2153 bp->b_private = req;
2154
2155 KernelWakeupFunc(bp);
2156 break;
2157
2158 case RF_IO_TYPE_READ:
2159 case RF_IO_TYPE_WRITE:
2160 #if RF_ACC_TRACE > 0
2161 if (req->tracerec) {
2162 RF_ETIMER_START(req->tracerec->timer);
2163 }
2164 #endif
2165 InitBP(bp, queue->rf_cinfo->ci_vp,
2166 op, queue->rf_cinfo->ci_dev,
2167 req->sectorOffset, req->numSector,
2168 req->buf, KernelWakeupFunc, (void *) req,
2169 queue->raidPtr->logBytesPerSector, req->b_proc);
2170
2171 if (rf_debugKernelAccess) {
2172 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2173 (long) bp->b_blkno));
2174 }
2175 queue->numOutstanding++;
2176 queue->last_deq_sector = req->sectorOffset;
2177 /* acc wouldn't have been let in if there were any pending
2178 * reqs at any other priority */
2179 queue->curPriority = req->priority;
2180
2181 db1_printf(("Going for %c to unit %d col %d\n",
2182 req->type, queue->raidPtr->raidid,
2183 queue->col));
2184 db1_printf(("sector %d count %d (%d bytes) %d\n",
2185 (int) req->sectorOffset, (int) req->numSector,
2186 (int) (req->numSector <<
2187 queue->raidPtr->logBytesPerSector),
2188 (int) queue->raidPtr->logBytesPerSector));
2189
2190 /*
2191 * XXX: drop lock here since this can block at
2192 * least with backing SCSI devices. Retake it
2193 * to minimize fuss with calling interfaces.
2194 */
2195
2196 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2197 bdev_strategy(bp);
2198 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2199 break;
2200
2201 default:
2202 panic("bad req->type in rf_DispatchKernelIO");
2203 }
2204 db1_printf(("Exiting from DispatchKernelIO\n"));
2205
2206 return (0);
2207 }
/* this is the callback function associated with a I/O invoked from
   kernel code.
 *
 * Runs as the buf's b_iodone handler.  Recovers the originating
 * RF_DiskQueueData_t from bp->b_private, updates trace statistics,
 * marks the component failed on I/O error (if the set can survive it),
 * and hands the request to the raidio thread via the iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The request was stashed in b_private by InitBP()/the NOP path. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/* iodone_lock protects the iodone list and the failure counters
	 * updated below. */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error. Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */

	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2276
2277
2278 /*
2279 * initialize a buf structure for doing an I/O in the kernel.
2280 */
2281 static void
2282 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2283 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2284 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2285 struct proc *b_proc)
2286 {
2287 /* bp->b_flags = B_PHYS | rw_flag; */
2288 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2289 bp->b_oflags = 0;
2290 bp->b_cflags = 0;
2291 bp->b_bcount = numSect << logBytesPerSector;
2292 bp->b_bufsize = bp->b_bcount;
2293 bp->b_error = 0;
2294 bp->b_dev = dev;
2295 bp->b_data = bf;
2296 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2297 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2298 if (bp->b_bcount == 0) {
2299 panic("bp->b_bcount is zero in InitBP!!");
2300 }
2301 bp->b_proc = b_proc;
2302 bp->b_iodone = cbFunc;
2303 bp->b_private = cbArg;
2304 }
2305
2306 static void
2307 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2308 struct disklabel *lp)
2309 {
2310 memset(lp, 0, sizeof(*lp));
2311
2312 /* fabricate a label... */
2313 lp->d_secperunit = raidPtr->totalSectors;
2314 lp->d_secsize = raidPtr->bytesPerSector;
2315 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2316 lp->d_ntracks = 4 * raidPtr->numCol;
2317 lp->d_ncylinders = raidPtr->totalSectors /
2318 (lp->d_nsectors * lp->d_ntracks);
2319 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2320
2321 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2322 lp->d_type = DTYPE_RAID;
2323 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2324 lp->d_rpm = 3600;
2325 lp->d_interleave = 1;
2326 lp->d_flags = 0;
2327
2328 lp->d_partitions[RAW_PART].p_offset = 0;
2329 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2330 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2331 lp->d_npartitions = RAW_PART + 1;
2332
2333 lp->d_magic = DISKMAGIC;
2334 lp->d_magic2 = DISKMAGIC;
2335 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2336
2337 }
2338 /*
2339 * Read the disklabel from the raid device. If one is not present, fake one
2340 * up.
2341 */
2342 static void
2343 raidgetdisklabel(dev_t dev)
2344 {
2345 int unit = raidunit(dev);
2346 struct raid_softc *rs;
2347 const char *errstring;
2348 struct disklabel *lp;
2349 struct cpu_disklabel *clp;
2350 RF_Raid_t *raidPtr;
2351
2352 if ((rs = raidget(unit)) == NULL)
2353 return;
2354
2355 lp = rs->sc_dkdev.dk_label;
2356 clp = rs->sc_dkdev.dk_cpulabel;
2357
2358 db1_printf(("Getting the disklabel...\n"));
2359
2360 memset(clp, 0, sizeof(*clp));
2361
2362 raidPtr = &rs->sc_r;
2363
2364 raidgetdefaultlabel(raidPtr, rs, lp);
2365
2366 /*
2367 * Call the generic disklabel extraction routine.
2368 */
2369 errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
2370 rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
2371 if (errstring)
2372 raidmakedisklabel(rs);
2373 else {
2374 int i;
2375 struct partition *pp;
2376
2377 /*
2378 * Sanity check whether the found disklabel is valid.
2379 *
2380 * This is necessary since total size of the raid device
2381 * may vary when an interleave is changed even though exactly
2382 * same components are used, and old disklabel may used
2383 * if that is found.
2384 */
2385 if (lp->d_secperunit != rs->sc_size)
2386 printf("raid%d: WARNING: %s: "
2387 "total sector size in disklabel (%" PRIu32 ") != "
2388 "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
2389 lp->d_secperunit, rs->sc_size);
2390 for (i = 0; i < lp->d_npartitions; i++) {
2391 pp = &lp->d_partitions[i];
2392 if (pp->p_offset + pp->p_size > rs->sc_size)
2393 printf("raid%d: WARNING: %s: end of partition `%c' "
2394 "exceeds the size of raid (%" PRIu64 ")\n",
2395 unit, rs->sc_xname, 'a' + i, rs->sc_size);
2396 }
2397 }
2398
2399 }
2400 /*
2401 * Take care of things one might want to take care of in the event
2402 * that a disklabel isn't present.
2403 */
2404 static void
2405 raidmakedisklabel(struct raid_softc *rs)
2406 {
2407 struct disklabel *lp = rs->sc_dkdev.dk_label;
2408 db1_printf(("Making a label..\n"));
2409
2410 /*
2411 * For historical reasons, if there's no disklabel present
2412 * the raw partition must be marked FS_BSDFFS.
2413 */
2414
2415 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2416
2417 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2418
2419 lp->d_checksum = dkcksum(lp);
2420 }
2421 /*
2422 * Wait interruptibly for an exclusive lock.
2423 *
2424 * XXX
2425 * Several drivers do this; it should be abstracted and made MP-safe.
2426 * (Hmm... where have we seen this warning before :-> GO )
2427 */
2428 static int
2429 raidlock(struct raid_softc *rs)
2430 {
2431 int error;
2432
2433 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2434 rs->sc_flags |= RAIDF_WANTED;
2435 if ((error =
2436 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2437 return (error);
2438 }
2439 rs->sc_flags |= RAIDF_LOCKED;
2440 return (0);
2441 }
2442 /*
2443 * Unlock and wake up any waiters.
2444 */
2445 static void
2446 raidunlock(struct raid_softc *rs)
2447 {
2448
2449 rs->sc_flags &= ~RAIDF_LOCKED;
2450 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2451 rs->sc_flags &= ~RAIDF_WANTED;
2452 wakeup(rs);
2453 }
2454 }
2455
2456
2457 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2458 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2459 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2460
static daddr_t
rf_component_info_offset(void)
{

	/* Fixed byte offset of the component label area on a component. */
	return RF_COMPONENT_INFO_OFFSET;
}
2467
2468 static daddr_t
2469 rf_component_info_size(unsigned secsize)
2470 {
2471 daddr_t info_size;
2472
2473 KASSERT(secsize);
2474 if (secsize > RF_COMPONENT_INFO_SIZE)
2475 info_size = secsize;
2476 else
2477 info_size = RF_COMPONENT_INFO_SIZE;
2478
2479 return info_size;
2480 }
2481
2482 static daddr_t
2483 rf_parity_map_offset(RF_Raid_t *raidPtr)
2484 {
2485 daddr_t map_offset;
2486
2487 KASSERT(raidPtr->bytesPerSector);
2488 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2489 map_offset = raidPtr->bytesPerSector;
2490 else
2491 map_offset = RF_COMPONENT_INFO_SIZE;
2492 map_offset += rf_component_info_offset();
2493
2494 return map_offset;
2495 }
2496
2497 static daddr_t
2498 rf_parity_map_size(RF_Raid_t *raidPtr)
2499 {
2500 daddr_t map_size;
2501
2502 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2503 map_size = raidPtr->bytesPerSector;
2504 else
2505 map_size = RF_PARITY_MAP_SIZE;
2506
2507 return map_size;
2508 }
2509
2510 int
2511 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2512 {
2513 RF_ComponentLabel_t *clabel;
2514
2515 clabel = raidget_component_label(raidPtr, col);
2516 clabel->clean = RF_RAID_CLEAN;
2517 raidflush_component_label(raidPtr, col);
2518 return(0);
2519 }
2520
2521
2522 int
2523 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2524 {
2525 RF_ComponentLabel_t *clabel;
2526
2527 clabel = raidget_component_label(raidPtr, col);
2528 clabel->clean = RF_RAID_DIRTY;
2529 raidflush_component_label(raidPtr, col);
2530 return(0);
2531 }
2532
2533 int
2534 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2535 {
2536 KASSERT(raidPtr->bytesPerSector);
2537 return raidread_component_label(raidPtr->bytesPerSector,
2538 raidPtr->Disks[col].dev,
2539 raidPtr->raid_cinfo[col].ci_vp,
2540 &raidPtr->raid_cinfo[col].ci_label);
2541 }
2542
2543 RF_ComponentLabel_t *
2544 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2545 {
2546 return &raidPtr->raid_cinfo[col].ci_label;
2547 }
2548
2549 int
2550 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2551 {
2552 RF_ComponentLabel_t *label;
2553
2554 label = &raidPtr->raid_cinfo[col].ci_label;
2555 label->mod_counter = raidPtr->mod_counter;
2556 #ifndef RF_NO_PARITY_MAP
2557 label->parity_map_modcount = label->mod_counter;
2558 #endif
2559 return raidwrite_component_label(raidPtr->bytesPerSector,
2560 raidPtr->Disks[col].dev,
2561 raidPtr->raid_cinfo[col].ci_vp, label);
2562 }
2563
2564
2565 static int
2566 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2567 RF_ComponentLabel_t *clabel)
2568 {
2569 return raidread_component_area(dev, b_vp, clabel,
2570 sizeof(RF_ComponentLabel_t),
2571 rf_component_info_offset(),
2572 rf_component_info_size(secsize));
2573 }
2574
2575 /* ARGSUSED */
2576 static int
2577 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2578 size_t msize, daddr_t offset, daddr_t dsize)
2579 {
2580 struct buf *bp;
2581 const struct bdevsw *bdev;
2582 int error;
2583
2584 /* XXX should probably ensure that we don't try to do this if
2585 someone has changed rf_protected_sectors. */
2586
2587 if (b_vp == NULL) {
2588 /* For whatever reason, this component is not valid.
2589 Don't try to read a component label from it. */
2590 return(EINVAL);
2591 }
2592
2593 /* get a block of the appropriate size... */
2594 bp = geteblk((int)dsize);
2595 bp->b_dev = dev;
2596
2597 /* get our ducks in a row for the read */
2598 bp->b_blkno = offset / DEV_BSIZE;
2599 bp->b_bcount = dsize;
2600 bp->b_flags |= B_READ;
2601 bp->b_resid = dsize;
2602
2603 bdev = bdevsw_lookup(bp->b_dev);
2604 if (bdev == NULL)
2605 return (ENXIO);
2606 (*bdev->d_strategy)(bp);
2607
2608 error = biowait(bp);
2609
2610 if (!error) {
2611 memcpy(data, bp->b_data, msize);
2612 }
2613
2614 brelse(bp, 0);
2615 return(error);
2616 }
2617
2618
2619 static int
2620 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2621 RF_ComponentLabel_t *clabel)
2622 {
2623 return raidwrite_component_area(dev, b_vp, clabel,
2624 sizeof(RF_ComponentLabel_t),
2625 rf_component_info_offset(),
2626 rf_component_info_size(secsize), 0);
2627 }
2628
2629 /* ARGSUSED */
2630 static int
2631 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2632 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2633 {
2634 struct buf *bp;
2635 const struct bdevsw *bdev;
2636 int error;
2637
2638 /* get a block of the appropriate size... */
2639 bp = geteblk((int)dsize);
2640 bp->b_dev = dev;
2641
2642 /* get our ducks in a row for the write */
2643 bp->b_blkno = offset / DEV_BSIZE;
2644 bp->b_bcount = dsize;
2645 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2646 bp->b_resid = dsize;
2647
2648 memset(bp->b_data, 0, dsize);
2649 memcpy(bp->b_data, data, msize);
2650
2651 bdev = bdevsw_lookup(bp->b_dev);
2652 if (bdev == NULL)
2653 return (ENXIO);
2654 (*bdev->d_strategy)(bp);
2655 if (asyncp)
2656 return 0;
2657 error = biowait(bp);
2658 brelse(bp, 0);
2659 if (error) {
2660 #if 1
2661 printf("Failed to write RAID component info!\n");
2662 #endif
2663 }
2664
2665 return(error);
2666 }
2667
2668 void
2669 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2670 {
2671 int c;
2672
2673 for (c = 0; c < raidPtr->numCol; c++) {
2674 /* Skip dead disks. */
2675 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2676 continue;
2677 /* XXXjld: what if an error occurs here? */
2678 raidwrite_component_area(raidPtr->Disks[c].dev,
2679 raidPtr->raid_cinfo[c].ci_vp, map,
2680 RF_PARITYMAP_NBYTE,
2681 rf_parity_map_offset(raidPtr),
2682 rf_parity_map_size(raidPtr), 0);
2683 }
2684 }
2685
2686 void
2687 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2688 {
2689 struct rf_paritymap_ondisk tmp;
2690 int c,first;
2691
2692 first=1;
2693 for (c = 0; c < raidPtr->numCol; c++) {
2694 /* Skip dead disks. */
2695 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2696 continue;
2697 raidread_component_area(raidPtr->Disks[c].dev,
2698 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2699 RF_PARITYMAP_NBYTE,
2700 rf_parity_map_offset(raidPtr),
2701 rf_parity_map_size(raidPtr));
2702 if (first) {
2703 memcpy(map, &tmp, sizeof(*map));
2704 first = 0;
2705 } else {
2706 rf_paritymap_merge(map, &tmp);
2707 }
2708 }
2709 }
2710
/*
 * Bump the set's mod counter and mark every usable component (and every
 * in-use spare) dirty on disk.  Spared components are left untouched.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			/* NOTE(review): if no column references this spare,
			   scol keeps its previous value (initially -1) —
			   presumably that can't happen for a used spare;
			   verify. */
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2770
2771
/*
 * Bump the mod counter and rewrite the component labels of all optimal
 * components and all in-use spares.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, components are
 * additionally marked clean (this is the normal shutdown path).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			/* NOTE(review): if no column references this spare,
			   scol keeps its previous value (initially -1) —
			   presumably that can't happen for a used spare;
			   verify. */
			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2846
2847 void
2848 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2849 {
2850
2851 if (vp != NULL) {
2852 if (auto_configured == 1) {
2853 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2854 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2855 vput(vp);
2856
2857 } else {
2858 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2859 }
2860 }
2861 }
2862
2863
2864 void
2865 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2866 {
2867 int r,c;
2868 struct vnode *vp;
2869 int acd;
2870
2871
2872 /* We take this opportunity to close the vnodes like we should.. */
2873
2874 for (c = 0; c < raidPtr->numCol; c++) {
2875 vp = raidPtr->raid_cinfo[c].ci_vp;
2876 acd = raidPtr->Disks[c].auto_configured;
2877 rf_close_component(raidPtr, vp, acd);
2878 raidPtr->raid_cinfo[c].ci_vp = NULL;
2879 raidPtr->Disks[c].auto_configured = 0;
2880 }
2881
2882 for (r = 0; r < raidPtr->numSpare; r++) {
2883 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2884 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2885 rf_close_component(raidPtr, vp, acd);
2886 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2887 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2888 }
2889 }
2890
2891
/*
 * Kernel-thread body: fail component req->col (optionally starting a
 * reconstruction onto a spare, per RF_FDFLAGS_RECON) and exit.  The
 * request structure is owned by — and freed by — this thread.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* this thread owns the request; release it when done */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2913
/*
 * Kernel-thread body: rewrite all parity on the set.  On success the
 * set is marked clean (so shutdown will set the clean bit on each
 * component label); on failure an error is logged.  Wakes any thread
 * waiting in shutdown for the rewrite to finish, then exits.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2944
2945
/*
 * Kernel-thread body: copy reconstructed data back from spares to
 * replaced components, then exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2960
2961
/*
 * Kernel-thread body: reconstruct component req->col in place (onto
 * the same device), free the request (owned by this thread), and exit.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* this thread owns the request; release it when done */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2979
/*
 * Try to read a component label from the device (dev/vp) named `cname'
 * of `size' sectors.  If a reasonable label is found, prepend a new
 * RF_AutoConfig_t for it to ac_list and return the new list head;
 * otherwise close the vnode and return ac_list unchanged.  On memory
 * exhaustion the entire list is torn down and NULL is returned.
 *
 * NOTE(review): the oomem path frees the list entries and their labels
 * but does not appear to close the vnodes referenced by those entries —
 * presumably acceptable for an "out of memory, probably should panic"
 * path; verify.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* out of memory: dismantle whatever we've built so far */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			/* prepend to the list; caller keeps the new head */
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
3037
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return a list of RF_AutoConfig_t entries describing them.
 * Wedges (dk devices) are checked via their wedge info; labeled disks
 * are checked partition by partition for FS_RAID; disks with neither
 * are checked on the raw partition as a last resort.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		/* wedges are addressed by plain minor, disks by
		   (unit, RAW_PART) */
		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* wedges carry a partition type of their own;
			   only DKW_PTYPE_RAIDFRAME wedges qualify */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component() takes ownership of vp */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
			rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
3219
3220
3221 int
3222 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3223 {
3224
3225 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3226 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3227 ((clabel->clean == RF_RAID_CLEAN) ||
3228 (clabel->clean == RF_RAID_DIRTY)) &&
3229 clabel->row >=0 &&
3230 clabel->column >= 0 &&
3231 clabel->num_rows > 0 &&
3232 clabel->num_columns > 0 &&
3233 clabel->row < clabel->num_rows &&
3234 clabel->column < clabel->num_columns &&
3235 clabel->blockSize > 0 &&
3236 /*
3237 * numBlocksHi may contain garbage, but it is ok since
3238 * the type is unsigned. If it is really garbage,
3239 * rf_fix_old_label_size() will fix it.
3240 */
3241 rf_component_label_numblocks(clabel) > 0) {
3242 /*
3243 * label looks reasonable enough...
3244 * let's make sure it has no old garbage.
3245 */
3246 if (numsecs)
3247 rf_fix_old_label_size(clabel, numsecs);
3248 return(1);
3249 }
3250 return(0);
3251 }
3252
3253
3254 /*
3255 * For reasons yet unknown, some old component labels have garbage in
3256 * the newer numBlocksHi region, and this causes lossage. Since those
3257 * disks will also have numsecs set to less than 32 bits of sectors,
3258 * we can determine when this corruption has occurred, and fix it.
3259 *
3260 * The exact same problem, with the same unknown reason, happens to
3261 * the partitionSizeHi member as well.
3262 */
3263 static void
3264 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3265 {
3266
3267 if (numsecs < ((uint64_t)1 << 32)) {
3268 if (clabel->numBlocksHi) {
3269 printf("WARNING: total sectors < 32 bits, yet "
3270 "numBlocksHi set\n"
3271 "WARNING: resetting numBlocksHi to zero.\n");
3272 clabel->numBlocksHi = 0;
3273 }
3274
3275 if (clabel->partitionSizeHi) {
3276 printf("WARNING: total sectors < 32 bits, yet "
3277 "partitionSizeHi set\n"
3278 "WARNING: resetting partitionSizeHi to zero.\n");
3279 clabel->partitionSizeHi = 0;
3280 }
3281 }
3282 }
3283
3284
3285 #ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.  Compiled only under DEBUG.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;

	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
3314 #endif
3315
/*
 * Partition the flat list of auto-detected components into config
 * sets, one per RAID set, using rf_does_it_fit() to match a component
 * against a set's first member.  Components are moved (re-linked) from
 * ac_list into the per-set lists.  Panics on allocation failure.
 */
RF_ConfigSet_t *
rf_create_auto_sets(RF_AutoConfig_t *ac_list)
{
	RF_AutoConfig_t *ac;
	RF_ConfigSet_t *config_sets;
	RF_ConfigSet_t *cset;
	RF_AutoConfig_t *ac_next;


	config_sets = NULL;

	/* Go through the AutoConfig list, and figure out which components
	   belong to what sets.  */
	ac = ac_list;
	while(ac!=NULL) {
		/* we're going to putz with ac->next, so save it here
		   for use at the end of the loop */
		ac_next = ac->next;

		if (config_sets == NULL) {
			/* will need at least this one... */
			config_sets = (RF_ConfigSet_t *)
				malloc(sizeof(RF_ConfigSet_t),
				       M_RAIDFRAME, M_NOWAIT);
			if (config_sets == NULL) {
				panic("rf_create_auto_sets: No memory!");
			}
			/* this one is easy :) */
			config_sets->ac = ac;
			config_sets->next = NULL;
			config_sets->rootable = 0;
			ac->next = NULL;
		} else {
			/* which set does this component fit into? */
			cset = config_sets;
			while(cset!=NULL) {
				if (rf_does_it_fit(cset, ac)) {
					/* looks like it matches... */
					ac->next = cset->ac;
					cset->ac = ac;
					break;
				}
				cset = cset->next;
			}
			if (cset==NULL) {
				/* didn't find a match above... new set..*/
				cset = (RF_ConfigSet_t *)
					malloc(sizeof(RF_ConfigSet_t),
					       M_RAIDFRAME, M_NOWAIT);
				if (cset == NULL) {
					panic("rf_create_auto_sets: No memory!");
				}
				cset->ac = ac;
				ac->next = NULL;
				/* new set goes to the head of the set list */
				cset->next = config_sets;
				cset->rootable = 0;
				config_sets = cset;
			}
		}
		ac = ac_next;
	}


	return(config_sets);
}
3381
3382 static int
3383 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3384 {
3385 RF_ComponentLabel_t *clabel1, *clabel2;
3386
3387 /* If this one matches the *first* one in the set, that's good
3388 enough, since the other members of the set would have been
3389 through here too... */
3390 /* note that we are not checking partitionSize here..
3391
3392 Note that we are also not checking the mod_counters here.
3393 If everything else matches except the mod_counter, that's
3394 good enough for this test. We will deal with the mod_counters
3395 a little later in the autoconfiguration process.
3396
3397 (clabel1->mod_counter == clabel2->mod_counter) &&
3398
3399 The reason we don't check for this is that failed disks
3400 will have lower modification counts. If those disks are
3401 not added to the set they used to belong to, then they will
3402 form their own set, which may result in 2 different sets,
3403 for example, competing to be configured at raid0, and
3404 perhaps competing to be the root filesystem set. If the
3405 wrong ones get configured, or both attempt to become /,
3406 weird behaviour and or serious lossage will occur. Thus we
3407 need to bring them into the fold here, and kick them out at
3408 a later point.
3409
3410 */
3411
3412 clabel1 = cset->ac->clabel;
3413 clabel2 = ac->clabel;
3414 if ((clabel1->version == clabel2->version) &&
3415 (clabel1->serial_number == clabel2->serial_number) &&
3416 (clabel1->num_rows == clabel2->num_rows) &&
3417 (clabel1->num_columns == clabel2->num_columns) &&
3418 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3419 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3420 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3421 (clabel1->parityConfig == clabel2->parityConfig) &&
3422 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3423 (clabel1->blockSize == clabel2->blockSize) &&
3424 rf_component_label_numblocks(clabel1) ==
3425 rf_component_label_numblocks(clabel2) &&
3426 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3427 (clabel1->root_partition == clabel2->root_partition) &&
3428 (clabel1->last_unit == clabel2->last_unit) &&
3429 (clabel1->config_order == clabel2->config_order)) {
3430 /* if it get's here, it almost *has* to be a match */
3431 } else {
3432 /* it's not consistent with somebody in the set..
3433 punt */
3434 return(0);
3435 }
3436 /* all was fine.. it must fit... */
3437 return(1);
3438 }
3439
/*
 * Decide whether configuration set 'cset' has enough live components
 * to be configured.  Returns 1 if configuration can proceed, 0 if too
 * many components are missing for the set's RAID level.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the highest counter among the members.  Components with a
	   lower (stale) counter are treated as failed below. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a current (mod_counter matches)
	   component occupying it. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
			/* Didn't find one here! */
			/* special case for RAID 1, especially
			   where there are more than 2
			   components (where RAIDframe treats
			   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						/* both halves of a mirror
						   pair are gone: the set
						   cannot be configured. */
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just finished the odd half of a mirror
			   pair and we didn't bail.. reset the
			   even_pair_failed flag, and go on to the
			   next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate
	   exactly one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3542
3543 void
3544 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3545 RF_Raid_t *raidPtr)
3546 {
3547 RF_ComponentLabel_t *clabel;
3548 int i;
3549
3550 clabel = ac->clabel;
3551
3552 /* 1. Fill in the common stuff */
3553 config->numRow = clabel->num_rows = 1;
3554 config->numCol = clabel->num_columns;
3555 config->numSpare = 0; /* XXX should this be set here? */
3556 config->sectPerSU = clabel->sectPerSU;
3557 config->SUsPerPU = clabel->SUsPerPU;
3558 config->SUsPerRU = clabel->SUsPerRU;
3559 config->parityConfig = clabel->parityConfig;
3560 /* XXX... */
3561 strcpy(config->diskQueueType,"fifo");
3562 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3563 config->layoutSpecificSize = 0; /* XXX ?? */
3564
3565 while(ac!=NULL) {
3566 /* row/col values will be in range due to the checks
3567 in reasonable_label() */
3568 strcpy(config->devnames[0][ac->clabel->column],
3569 ac->devname);
3570 ac = ac->next;
3571 }
3572
3573 for(i=0;i<RF_MAXDBGV;i++) {
3574 config->debugVars[i][0] = 0;
3575 }
3576 }
3577
3578 int
3579 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3580 {
3581 RF_ComponentLabel_t *clabel;
3582 int column;
3583 int sparecol;
3584
3585 raidPtr->autoconfigure = new_value;
3586
3587 for(column=0; column<raidPtr->numCol; column++) {
3588 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3589 clabel = raidget_component_label(raidPtr, column);
3590 clabel->autoconfigure = new_value;
3591 raidflush_component_label(raidPtr, column);
3592 }
3593 }
3594 for(column = 0; column < raidPtr->numSpare ; column++) {
3595 sparecol = raidPtr->numCol + column;
3596 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3597 clabel = raidget_component_label(raidPtr, sparecol);
3598 clabel->autoconfigure = new_value;
3599 raidflush_component_label(raidPtr, sparecol);
3600 }
3601 }
3602 return(new_value);
3603 }
3604
3605 int
3606 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3607 {
3608 RF_ComponentLabel_t *clabel;
3609 int column;
3610 int sparecol;
3611
3612 raidPtr->root_partition = new_value;
3613 for(column=0; column<raidPtr->numCol; column++) {
3614 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3615 clabel = raidget_component_label(raidPtr, column);
3616 clabel->root_partition = new_value;
3617 raidflush_component_label(raidPtr, column);
3618 }
3619 }
3620 for(column = 0; column < raidPtr->numSpare ; column++) {
3621 sparecol = raidPtr->numCol + column;
3622 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3623 clabel = raidget_component_label(raidPtr, sparecol);
3624 clabel->root_partition = new_value;
3625 raidflush_component_label(raidPtr, sparecol);
3626 }
3627 }
3628 return(new_value);
3629 }
3630
3631 void
3632 rf_release_all_vps(RF_ConfigSet_t *cset)
3633 {
3634 RF_AutoConfig_t *ac;
3635
3636 ac = cset->ac;
3637 while(ac!=NULL) {
3638 /* Close the vp, and give it back */
3639 if (ac->vp) {
3640 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3641 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3642 vput(ac->vp);
3643 ac->vp = NULL;
3644 }
3645 ac = ac->next;
3646 }
3647 }
3648
3649
3650 void
3651 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3652 {
3653 RF_AutoConfig_t *ac;
3654 RF_AutoConfig_t *next_ac;
3655
3656 ac = cset->ac;
3657 while(ac!=NULL) {
3658 next_ac = ac->next;
3659 /* nuke the label */
3660 free(ac->clabel, M_RAIDFRAME);
3661 /* cleanup the config structure */
3662 free(ac, M_RAIDFRAME);
3663 /* "next.." */
3664 ac = next_ac;
3665 }
3666 /* and, finally, nuke the config set */
3667 free(cset, M_RAIDFRAME);
3668 }
3669
3670
/*
 * Fill in 'clabel' from the live state of 'raidPtr', ready to be
 * written out to the set's components.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* Set geometry and state. */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* Layout parameters. */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	/* splits the 64-bit sector count into the label's Hi/Lo fields */
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3703
/*
 * Autoconfigure the RAID set described by 'cset': pick a raid unit
 * (preferring the one recorded in the component labels), build an
 * RF_Config_t from the labels, and run rf_Configure() on it.
 * Returns the softc of the configured set, or NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	 */

	/* Walk forward from the preferred unit until a free one is found. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		/* configuration succeeded: finish driver-level setup */
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		if (cset->ac->clabel->root_partition==1) {
			/* everything configured just fine.  Make a note
			   that this set is eligible to be root. */
			cset->rootable = 1;
			/* XXX do this here? */
			raidPtr->root_partition = 1;
		}
	} else {
		/* configuration failed: release the unit we grabbed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3769
3770 void
3771 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3772 {
3773 struct buf *bp;
3774 struct raid_softc *rs;
3775
3776 bp = (struct buf *)desc->bp;
3777 rs = desc->raidPtr->softc;
3778 disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
3779 (bp->b_flags & B_READ));
3780 }
3781
/*
 * Initialize a pool of fixed-size RAIDframe structures and keep its
 * item count between xmin and xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);		/* cache at most xmax idle items */
	pool_prime(p, xmin);		/* preallocate xmin items now */
	pool_setlowat(p, xmin);		/* keep at least xmin items cached */
}
3791
3792 /*
3793 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
3794 * if there is IO pending and if that IO could possibly be done for a
3795 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3796 * otherwise.
3797 *
3798 */
3799
3800 int
3801 rf_buf_queue_check(RF_Raid_t *raidPtr)
3802 {
3803 struct raid_softc *rs = raidPtr->softc;
3804 if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
3805 /* there is work to do */
3806 return 0;
3807 }
3808 /* default is nothing to do */
3809 return 1;
3810 }
3811
3812 int
3813 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3814 {
3815 uint64_t numsecs;
3816 unsigned secsize;
3817 int error;
3818
3819 error = getdisksize(vp, &numsecs, &secsize);
3820 if (error == 0) {
3821 diskPtr->blockSize = secsize;
3822 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3823 diskPtr->partitionSize = numsecs;
3824 return 0;
3825 }
3826 return error;
3827 }
3828
/*
 * Autoconf match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3834
/*
 * Autoconf attach function: intentionally empty — per-unit setup is
 * done when a set is actually configured, not at attach time.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3840
3841
3842 static int
3843 raid_detach(device_t self, int flags)
3844 {
3845 int error;
3846 struct raid_softc *rs = raidget(device_unit(self));
3847
3848 if (rs == NULL)
3849 return ENXIO;
3850
3851 if ((error = raidlock(rs)) != 0)
3852 return (error);
3853
3854 error = raid_detach_unlocked(rs);
3855
3856 raidunlock(rs);
3857
3858 /* XXXkd: raidput(rs) ??? */
3859
3860 return error;
3861 }
3862
/*
 * Synthesize a disk geometry for the raid unit and publish it via
 * disk_set_info().
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	/* NOTE(review): nsectors/ntracks are fabricated values (one data
	   stripe per "track", 4 tracks per column) rather than physical
	   geometry — presumably only used for cosmetic reporting. */
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
}
3877
3878 /*
3879 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3880 * We end up returning whatever error was returned by the first cache flush
3881 * that fails.
3882 */
3883
3884 int
3885 rf_sync_component_caches(RF_Raid_t *raidPtr)
3886 {
3887 int c, sparecol;
3888 int e,error;
3889 int force = 1;
3890
3891 error = 0;
3892 for (c = 0; c < raidPtr->numCol; c++) {
3893 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3894 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3895 &force, FWRITE, NOCRED);
3896 if (e) {
3897 if (e != ENODEV)
3898 printf("raid%d: cache flush to component %s failed.\n",
3899 raidPtr->raidid, raidPtr->Disks[c].devname);
3900 if (error == 0) {
3901 error = e;
3902 }
3903 }
3904 }
3905 }
3906
3907 for( c = 0; c < raidPtr->numSpare ; c++) {
3908 sparecol = raidPtr->numCol + c;
3909 /* Need to ensure that the reconstruct actually completed! */
3910 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3911 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3912 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3913 if (e) {
3914 if (e != ENODEV)
3915 printf("raid%d: cache flush to component %s failed.\n",
3916 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3917 if (error == 0) {
3918 error = e;
3919 }
3920 }
3921 }
3922 }
3923 return error;
3924 }
3925
3926 static void
3927 raidminphys(struct buf *bp)
3928 {
3929 dev_t dev;
3930 int unit;
3931 struct raid_softc *rs;
3932 RF_Raid_t *raidPtr;
3933 long xmax;
3934
3935 dev = bp->b_dev;
3936 unit = raidunit(dev);
3937 rs = raidget(unit);
3938 raidPtr = &(rs->sc_r);
3939
3940 xmax = raidPtr->Layout.numDataCol * MAXPHYS;
3941
3942 if (bp->b_bcount > xmax) {
3943 bp->b_bcount = xmax;
3944 }
3945 }
3946