rf_netbsdkintf.c revision 1.250.4.6.2.1 1 /* $NetBSD: rf_netbsdkintf.c,v 1.250.4.6.2.1 2014/11/20 15:52:10 sborrill Exp $ */
2 /*-
3 * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Greg Oster; Jason R. Thorpe.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * Copyright (c) 1990, 1993
33 * The Regents of the University of California. All rights reserved.
34 *
35 * This code is derived from software contributed to Berkeley by
36 * the Systems Programming Group of the University of Utah Computer
37 * Science Department.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 * may be used to endorse or promote products derived from this software
49 * without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 *
63 * from: Utah $Hdr: cd.c 1.6 90/11/28$
64 *
65 * @(#)cd.c 8.2 (Berkeley) 11/16/93
66 */
67
68 /*
69 * Copyright (c) 1988 University of Utah.
70 *
71 * This code is derived from software contributed to Berkeley by
72 * the Systems Programming Group of the University of Utah Computer
73 * Science Department.
74 *
75 * Redistribution and use in source and binary forms, with or without
76 * modification, are permitted provided that the following conditions
77 * are met:
78 * 1. Redistributions of source code must retain the above copyright
79 * notice, this list of conditions and the following disclaimer.
80 * 2. Redistributions in binary form must reproduce the above copyright
81 * notice, this list of conditions and the following disclaimer in the
82 * documentation and/or other materials provided with the distribution.
83 * 3. All advertising materials mentioning features or use of this software
84 * must display the following acknowledgement:
85 * This product includes software developed by the University of
86 * California, Berkeley and its contributors.
87 * 4. Neither the name of the University nor the names of its contributors
88 * may be used to endorse or promote products derived from this software
89 * without specific prior written permission.
90 *
91 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
92 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
93 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
94 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
95 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
96 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
97 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
98 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
99 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
100 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
101 * SUCH DAMAGE.
102 *
103 * from: Utah $Hdr: cd.c 1.6 90/11/28$
104 *
105 * @(#)cd.c 8.2 (Berkeley) 11/16/93
106 */
107
108 /*
109 * Copyright (c) 1995 Carnegie-Mellon University.
110 * All rights reserved.
111 *
112 * Authors: Mark Holland, Jim Zelenka
113 *
114 * Permission to use, copy, modify and distribute this software and
115 * its documentation is hereby granted, provided that both the copyright
116 * notice and this permission notice appear in all copies of the
117 * software, derivative works or modified versions, and any portions
118 * thereof, and that both notices appear in supporting documentation.
119 *
120 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
121 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
122 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
123 *
124 * Carnegie Mellon requests users of this software to return to
125 *
126 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
127 * School of Computer Science
128 * Carnegie Mellon University
129 * Pittsburgh PA 15213-3890
130 *
131 * any improvements or extensions that they make and grant Carnegie the
132 * rights to redistribute these changes.
133 */
134
135 /***********************************************************
136 *
137 * rf_kintf.c -- the kernel interface routines for RAIDframe
138 *
139 ***********************************************************/
140
141 #include <sys/cdefs.h>
142 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.250.4.6.2.1 2014/11/20 15:52:10 sborrill Exp $");
143
144 #include <sys/param.h>
145 #include <sys/errno.h>
146 #include <sys/pool.h>
147 #include <sys/proc.h>
148 #include <sys/queue.h>
149 #include <sys/disk.h>
150 #include <sys/device.h>
151 #include <sys/stat.h>
152 #include <sys/ioctl.h>
153 #include <sys/fcntl.h>
154 #include <sys/systm.h>
155 #include <sys/vnode.h>
156 #include <sys/disklabel.h>
157 #include <sys/conf.h>
158 #include <sys/buf.h>
159 #include <sys/bufq.h>
160 #include <sys/user.h>
161 #include <sys/reboot.h>
162 #include <sys/kauth.h>
163
164 #include <prop/proplib.h>
165
166 #include <dev/raidframe/raidframevar.h>
167 #include <dev/raidframe/raidframeio.h>
168 #include <dev/raidframe/rf_paritymap.h>
169 #include "raid.h"
170 #include "opt_raid_autoconfig.h"
171 #include "rf_raid.h"
172 #include "rf_copyback.h"
173 #include "rf_dag.h"
174 #include "rf_dagflags.h"
175 #include "rf_desc.h"
176 #include "rf_diskqueue.h"
177 #include "rf_etimer.h"
178 #include "rf_general.h"
179 #include "rf_kintf.h"
180 #include "rf_options.h"
181 #include "rf_driver.h"
182 #include "rf_parityscan.h"
183 #include "rf_threadstuff.h"
184
/*
 * Debug printf wrapper: db1_printf() emits output only when the kernel
 * is built with DEBUG and rf_kdebug_level has been raised above 0.
 */
#ifdef DEBUG
int     rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else				/* DEBUG */
#define db1_printf(a) { }
#endif				/* DEBUG */

static RF_Raid_t **raidPtrs;	/* global raid device descriptors */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* Serializes access to the two spare-table queues below. */
RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */
#endif

/* Malloc type under which all RAIDframe kernel allocations are charged. */
MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");

/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int, struct proc *);
static void raidinit(RF_Raid_t *);

void raidattach(int);
static int raid_match(struct device *, struct cfdata *, void *);
static void raid_attach(struct device *, struct device *, void *);
static int raid_detach(struct device *, int);

/* Raw read/write of an area of a component (labels, maps, etc.). */
static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

static int raidwrite_component_label(dev_t, struct vnode *,
    RF_ComponentLabel_t *);
static int raidread_component_label(dev_t, struct vnode *,
    RF_ComponentLabel_t *);


dev_type_open(raidopen);
dev_type_close(raidclose);
dev_type_read(raidread);
dev_type_write(raidwrite);
dev_type_ioctl(raidioctl);
dev_type_strategy(raidstrategy);
dev_type_dump(raiddump);
dev_type_size(raidsize);

/* Block-device switch: entry points for the /dev/raidN block nodes. */
const struct bdevsw raid_bdevsw = {
	raidopen, raidclose, raidstrategy, raidioctl,
	raiddump, raidsize, D_DISK
};

/* Character-device switch: entry points for the /dev/rraidN raw nodes. */
const struct cdevsw raid_cdevsw = {
	raidopen, raidclose, raidread, raidwrite, raidioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

/* Hooks used by the generic disk(9) framework. */
static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
248
/* XXX Not sure if the following should be replacing the raidPtrs above,
   or if it should be used in conjunction with that...
*/

/* Per-unit driver state ("softc"); one per configured raid device. */
struct raid_softc {
	struct device *sc_dev;	/* autoconf device handle */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */

#define	raidunit(x)	DISKUNIT(x)
int numraid = 0;		/* number of units set up by raidattach() */

extern struct cfdriver raid_cd;
CFATTACH_DECL_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL);
275
/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even it if is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif

/* The raw-partition device node used for disklabel I/O on a unit. */
#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */
struct raid_softc *raid_softc;	/* array of numraid entries; allocated in
				 * raidattach() */

/* Disklabel handling helpers. */
static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
				struct disklabel *);
static void raidgetdisklabel(dev_t);
static void raidmakedisklabel(struct raid_softc *);

/* Per-unit lock/unlock (RAIDF_LOCKED / RAIDF_WANTED protocol). */
static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_properties(struct raid_softc *, RF_Raid_t *);

/* Kernel-thread entry points for long-running maintenance operations. */
void rf_ReconThread(struct rf_recon_req *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req *);
int rf_autoconfig(struct device *self);
void rf_buildroothack(RF_ConfigSet_t *);

/* Autoconfiguration: find labelled components and assemble sets. */
RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
static int rf_reasonable_label(RF_ComponentLabel_t *);
void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
int rf_auto_config_set(RF_ConfigSet_t *, int *);

static int raidautoconfig = 0;	/* Debugging, mostly.  Set to 0 to not
				   allow autoconfig to take place.
				   Note that this is overridden by having
				   RAID_AUTOCONFIG as an option in the
				   kernel config file. */

struct RF_Pools_s rf_pools;	/* memory pools shared by all units */
340
/*
 * raidattach: pseudo-device attach routine, called once at boot with the
 * number of units ("pseudo-device raid N" in the kernel config).
 * Allocates the global raidPtrs[] and raid_softc[] arrays, boots the
 * RAIDframe core, registers the autoconf attachment, and schedules
 * rf_autoconfig() to run as a config finalizer after real hardware has
 * been found.  Errors here leave the driver partially set up (numraid is
 * trimmed accordingly); there is no caller to return an error to.
 */
void
raidattach(int num)
{
	int raidID;
	int i, rc;

#ifdef DEBUG
	printf("raidattach: Asked for %d units\n", num);
#endif

	if (num <= 0) {
#ifdef DIAGNOSTIC
		panic("raidattach: count <= 0");
#endif
		return;
	}
	/* This is where all the initialization stuff gets done. */

	numraid = num;

	/* Make some space for requested number of units... */

	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
	if (raidPtrs == NULL) {
		panic("raidPtrs is NULL!!");
	}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_mutex_init(&rf_sparet_wait_mutex);

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	for (i = 0; i < num; i++)
		raidPtrs[i] = NULL;
	rc = rf_BootRaidframe();
	if (rc == 0)
		aprint_normal("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	/* put together some datastructures like the CCD device does.. This
	 * lets us lock the device and what-not when it gets opened. */

	raid_softc = (struct raid_softc *)
	    malloc(num * sizeof(struct raid_softc),
		   M_RAIDFRAME, M_NOWAIT);
	if (raid_softc == NULL) {
		aprint_error("WARNING: no memory for RAIDframe driver\n");
		return;
	}

	memset(raid_softc, 0, num * sizeof(struct raid_softc));

	for (raidID = 0; raidID < num; raidID++) {
		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);

		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
			  (RF_Raid_t *));
		if (raidPtrs[raidID] == NULL) {
			/* NOTE(review): on this failure path the bufq just
			   allocated for raidID is not freed; trimming
			   numraid hides the unit but leaks the queue. */
			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
			numraid = raidID;
			return;
		}
	}

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

#ifdef RAID_AUTOCONFIG
	raidautoconfig = 1;
#endif

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
422
423 int
424 rf_autoconfig(struct device *self)
425 {
426 RF_AutoConfig_t *ac_list;
427 RF_ConfigSet_t *config_sets;
428
429 if (raidautoconfig == 0)
430 return (0);
431
432 /* XXX This code can only be run once. */
433 raidautoconfig = 0;
434
435 /* 1. locate all RAID components on the system */
436 #ifdef DEBUG
437 printf("Searching for RAID components...\n");
438 #endif
439 ac_list = rf_find_raid_components();
440
441 /* 2. Sort them into their respective sets. */
442 config_sets = rf_create_auto_sets(ac_list);
443
444 /*
445 * 3. Evaluate each set andconfigure the valid ones.
446 * This gets done in rf_buildroothack().
447 */
448 rf_buildroothack(config_sets);
449
450 return 1;
451 }
452
/*
 * rf_buildroothack: configure each eligible config set, then try to decide
 * whether a configured RAID set should become the root device.
 *
 * A set is configured when it has enough components and its component
 * labels request autoconfiguration; all other sets have their resources
 * released.  If exactly one configured set is marked rootable it becomes
 * booted_device; with several candidates we narrow the choice to the set
 * containing the device the machine actually booted from, and failing
 * that we fall back to asking the user (RB_ASKNAME).
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int retcode;
	int raidID;
	int rootID;
	int col;
	int num_root;
	char *devname;

	rootID = 0;
	num_root = 0;
	cset = config_sets;
	while(cset != NULL ) {
		/* rf_cleanup_config_set() frees cset, so grab next first. */
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure==1) {
			retcode = rf_auto_config_set(cset,&raidID);
			if (!retcode) {
#ifdef DEBUG
				printf("raid%d: configured ok\n", raidID);
#endif
				if (cset->rootable) {
					rootID = raidID;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
#ifdef DEBUG
				printf("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
#endif
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		booted_device = raid_softc[rootID].sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/*
		 * Recount the candidates, keeping only root-marked sets
		 * that contain the component we actually booted from.
		 */
		num_root = 0;
		for (raidID = 0; raidID < numraid; raidID++) {
			if (raidPtrs[raidID]->valid == 0)
				continue;

			if (raidPtrs[raidID]->root_partition == 0)
				continue;

			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
				devname = raidPtrs[raidID]->Disks[col].devname;
				/* strip the "/dev/" prefix before comparing
				   against the autoconf device name */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
					    strlen(device_xname(booted_device))) != 0)
					continue;
#ifdef DEBUG
				printf("raid%d includes boot device %s\n",
				       raidID, devname);
#endif
				num_root++;
				rootID = raidID;
			}
		}

		if (num_root == 1) {
			booted_device = raid_softc[rootID].sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
553
554
555 int
556 raidsize(dev_t dev)
557 {
558 struct raid_softc *rs;
559 struct disklabel *lp;
560 int part, unit, omask, size;
561
562 unit = raidunit(dev);
563 if (unit >= numraid)
564 return (-1);
565 rs = &raid_softc[unit];
566
567 if ((rs->sc_flags & RAIDF_INITED) == 0)
568 return (-1);
569
570 part = DISKPART(dev);
571 omask = rs->sc_dkdev.dk_openmask & (1 << part);
572 lp = rs->sc_dkdev.dk_label;
573
574 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
575 return (-1);
576
577 if (lp->d_partitions[part].p_fstype != FS_SWAP)
578 size = -1;
579 else
580 size = lp->d_partitions[part].p_size *
581 (lp->d_secsize / DEV_BSIZE);
582
583 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
584 return (-1);
585
586 return (size);
587
588 }
589
/*
 * raiddump: bdevsw d_dump entry point -- write a crash dump through the
 * RAID set.  Only RAID 1 sets (one data + one parity column) are
 * supported; the dump is sent to a single live component chosen by the
 * preference order documented below, bypassing RAIDframe's normal I/O
 * path by calling the component's own d_dump routine directly.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int part, c, sparecol, j, scol, dumpto;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);

	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* Dumps must be a whole number of DEV_BSIZE blocks. */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* Refuse to write past the end of the RAID device. */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column (if any)
			   this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
/* ARGSUSED */
/*
 * raidopen: open a partition of a raid unit (both bdevsw and cdevsw
 * d_open entry point).  Rereads the disklabel on first open, refuses
 * non-raw opens when wedges exist or the partition is absent/unused,
 * records the open in the per-format open masks, and marks all
 * components dirty on the first open of a configured set.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, pmask;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	/* Exclude concurrent open/close/unconfigure on this unit. */
	if ((error = raidlock(rs)) != 0)
		return (error);
	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of a configured unit: refresh the in-core label. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		 here... If so, we needn't do this, but then need some
		 other way of keeping track of what's happened.. */

		rf_markalldirty( raidPtrs[unit] );
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
/* ARGSUSED */
/*
 * raidclose: close a partition of a raid unit.  Clears the open-mask bit
 * for this partition/format; on last close of a configured unit the
 * component labels are marked clean, and if the system is shutting down
 * the whole RAID set is shut down and detached as well.
 */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct cfdata *cf;
	struct raid_softc *rs;
	int error = 0;
	int part;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	/* Serialize against open/unconfigure. */
	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(raidPtrs[unit],
						 RF_FINAL_COMPONENT_UPDATE);
		if (doing_shutdown) {
			/* last one, and we're going down, so
			   lights out for this RAID set too. */
			error = rf_Shutdown(raidPtrs[unit]);

			/* It's no longer initialized... */
			rs->sc_flags &= ~RAIDF_INITED;

			/* detach the device */

			cf = device_cfdata(rs->sc_dev);
			error = config_detach(rs->sc_dev, DETACH_QUIET);
			/* NOTE(review): the config_detach() result
			   overwrites any error from rf_Shutdown(), and
			   both are then discarded by the unconditional
			   "return (0)" below.  Presumably deliberate on
			   the shutdown path -- confirm before changing. */
			free(cf, M_RAIDFRAME);

			/* Detach the disk. */
			disk_detach(&rs->sc_dkdev);
			disk_destroy(&rs->sc_dkdev);
		}
	}

	raidunlock(rs);
	return (0);

}
872
873 void
874 raidstrategy(struct buf *bp)
875 {
876 int s;
877
878 unsigned int raidID = raidunit(bp->b_dev);
879 RF_Raid_t *raidPtr;
880 struct raid_softc *rs = &raid_softc[raidID];
881 int wlabel;
882
883 if ((rs->sc_flags & RAIDF_INITED) ==0) {
884 bp->b_error = ENXIO;
885 goto done;
886 }
887 if (raidID >= numraid || !raidPtrs[raidID]) {
888 bp->b_error = ENODEV;
889 goto done;
890 }
891 raidPtr = raidPtrs[raidID];
892 if (!raidPtr->valid) {
893 bp->b_error = ENODEV;
894 goto done;
895 }
896 if (bp->b_bcount == 0) {
897 db1_printf(("b_bcount is zero..\n"));
898 goto done;
899 }
900
901 /*
902 * Do bounds checking and adjust transfer. If there's an
903 * error, the bounds check will flag that for us.
904 */
905
906 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
907 if (DISKPART(bp->b_dev) == RAW_PART) {
908 uint64_t size; /* device size in DEV_BSIZE unit */
909
910 if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
911 size = raidPtr->totalSectors <<
912 (raidPtr->logBytesPerSector - DEV_BSHIFT);
913 } else {
914 size = raidPtr->totalSectors >>
915 (DEV_BSHIFT - raidPtr->logBytesPerSector);
916 }
917 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
918 goto done;
919 }
920 } else {
921 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
922 db1_printf(("Bounds check failed!!:%d %d\n",
923 (int) bp->b_blkno, (int) wlabel));
924 goto done;
925 }
926 }
927 s = splbio();
928
929 bp->b_resid = 0;
930
931 /* stuff it onto our queue */
932 BUFQ_PUT(rs->buf_queue, bp);
933
934 /* scheduled the IO to happen at the next convenient time */
935 wakeup(&(raidPtrs[raidID]->iodone));
936
937 splx(s);
938 return;
939
940 done:
941 bp->b_resid = bp->b_bcount;
942 biodone(bp);
943 }
944 /* ARGSUSED */
945 int
946 raidread(dev_t dev, struct uio *uio, int flags)
947 {
948 int unit = raidunit(dev);
949 struct raid_softc *rs;
950
951 if (unit >= numraid)
952 return (ENXIO);
953 rs = &raid_softc[unit];
954
955 if ((rs->sc_flags & RAIDF_INITED) == 0)
956 return (ENXIO);
957
958 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
959
960 }
961 /* ARGSUSED */
962 int
963 raidwrite(dev_t dev, struct uio *uio, int flags)
964 {
965 int unit = raidunit(dev);
966 struct raid_softc *rs;
967
968 if (unit >= numraid)
969 return (ENXIO);
970 rs = &raid_softc[unit];
971
972 if ((rs->sc_flags & RAIDF_INITED) == 0)
973 return (ENXIO);
974
975 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
976
977 }
978
979 int
980 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
981 {
982 int unit = raidunit(dev);
983 int error = 0;
984 int part, pmask;
985 struct cfdata *cf;
986 struct raid_softc *rs;
987 RF_Config_t *k_cfg, *u_cfg;
988 RF_Raid_t *raidPtr;
989 RF_RaidDisk_t *diskPtr;
990 RF_AccTotals_t *totals;
991 RF_DeviceConfig_t *d_cfg, **ucfgp;
992 u_char *specific_buf;
993 int retcode = 0;
994 int column;
995 /* int raidid; */
996 struct rf_recon_req *rrcopy, *rr;
997 RF_ComponentLabel_t *clabel;
998 RF_ComponentLabel_t *ci_label;
999 RF_ComponentLabel_t **clabel_ptr;
1000 RF_SingleComponent_t *sparePtr,*componentPtr;
1001 RF_SingleComponent_t component;
1002 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1003 int i, j, d;
1004 #ifdef __HAVE_OLD_DISKLABEL
1005 struct disklabel newlabel;
1006 #endif
1007 struct dkwedge_info *dkw;
1008
1009 if (unit >= numraid)
1010 return (ENXIO);
1011 rs = &raid_softc[unit];
1012 raidPtr = raidPtrs[unit];
1013
1014 db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
1015 (int) DISKPART(dev), (int) unit, (int) cmd));
1016
1017 /* Must be open for writes for these commands... */
1018 switch (cmd) {
1019 #ifdef DIOCGSECTORSIZE
1020 case DIOCGSECTORSIZE:
1021 *(u_int *)data = raidPtr->bytesPerSector;
1022 return 0;
1023 case DIOCGMEDIASIZE:
1024 *(off_t *)data =
1025 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1026 return 0;
1027 #endif
1028 case DIOCSDINFO:
1029 case DIOCWDINFO:
1030 #ifdef __HAVE_OLD_DISKLABEL
1031 case ODIOCWDINFO:
1032 case ODIOCSDINFO:
1033 #endif
1034 case DIOCWLABEL:
1035 case DIOCAWEDGE:
1036 case DIOCDWEDGE:
1037 if ((flag & FWRITE) == 0)
1038 return (EBADF);
1039 }
1040
1041 /* Must be initialized for these... */
1042 switch (cmd) {
1043 case DIOCGDINFO:
1044 case DIOCSDINFO:
1045 case DIOCWDINFO:
1046 #ifdef __HAVE_OLD_DISKLABEL
1047 case ODIOCGDINFO:
1048 case ODIOCWDINFO:
1049 case ODIOCSDINFO:
1050 case ODIOCGDEFLABEL:
1051 #endif
1052 case DIOCGPART:
1053 case DIOCWLABEL:
1054 case DIOCGDEFLABEL:
1055 case DIOCAWEDGE:
1056 case DIOCDWEDGE:
1057 case DIOCLWEDGES:
1058 case DIOCCACHESYNC:
1059 case RAIDFRAME_SHUTDOWN:
1060 case RAIDFRAME_REWRITEPARITY:
1061 case RAIDFRAME_GET_INFO:
1062 case RAIDFRAME_RESET_ACCTOTALS:
1063 case RAIDFRAME_GET_ACCTOTALS:
1064 case RAIDFRAME_KEEP_ACCTOTALS:
1065 case RAIDFRAME_GET_SIZE:
1066 case RAIDFRAME_FAIL_DISK:
1067 case RAIDFRAME_COPYBACK:
1068 case RAIDFRAME_CHECK_RECON_STATUS:
1069 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1070 case RAIDFRAME_GET_COMPONENT_LABEL:
1071 case RAIDFRAME_SET_COMPONENT_LABEL:
1072 case RAIDFRAME_ADD_HOT_SPARE:
1073 case RAIDFRAME_REMOVE_HOT_SPARE:
1074 case RAIDFRAME_INIT_LABELS:
1075 case RAIDFRAME_REBUILD_IN_PLACE:
1076 case RAIDFRAME_CHECK_PARITY:
1077 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1078 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1079 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1080 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1081 case RAIDFRAME_SET_AUTOCONFIG:
1082 case RAIDFRAME_SET_ROOT:
1083 case RAIDFRAME_DELETE_COMPONENT:
1084 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1085 case RAIDFRAME_PARITYMAP_STATUS:
1086 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1087 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1088 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1089 if ((rs->sc_flags & RAIDF_INITED) == 0)
1090 return (ENXIO);
1091 }
1092
1093 switch (cmd) {
1094
1095 /* configure the system */
1096 case RAIDFRAME_CONFIGURE:
1097
1098 if (raidPtr->valid) {
1099 /* There is a valid RAID set running on this unit! */
1100 printf("raid%d: Device already configured!\n",unit);
1101 return(EINVAL);
1102 }
1103
1104 /* copy-in the configuration information */
1105 /* data points to a pointer to the configuration structure */
1106
1107 u_cfg = *((RF_Config_t **) data);
1108 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1109 if (k_cfg == NULL) {
1110 return (ENOMEM);
1111 }
1112 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1113 if (retcode) {
1114 RF_Free(k_cfg, sizeof(RF_Config_t));
1115 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1116 retcode));
1117 return (retcode);
1118 }
1119 /* allocate a buffer for the layout-specific data, and copy it
1120 * in */
1121 if (k_cfg->layoutSpecificSize) {
1122 if (k_cfg->layoutSpecificSize > 10000) {
1123 /* sanity check */
1124 RF_Free(k_cfg, sizeof(RF_Config_t));
1125 return (EINVAL);
1126 }
1127 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1128 (u_char *));
1129 if (specific_buf == NULL) {
1130 RF_Free(k_cfg, sizeof(RF_Config_t));
1131 return (ENOMEM);
1132 }
1133 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1134 k_cfg->layoutSpecificSize);
1135 if (retcode) {
1136 RF_Free(k_cfg, sizeof(RF_Config_t));
1137 RF_Free(specific_buf,
1138 k_cfg->layoutSpecificSize);
1139 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1140 retcode));
1141 return (retcode);
1142 }
1143 } else
1144 specific_buf = NULL;
1145 k_cfg->layoutSpecific = specific_buf;
1146
1147 /* should do some kind of sanity check on the configuration.
1148 * Store the sum of all the bytes in the last byte? */
1149
1150 /* configure the system */
1151
1152 /*
1153 * Clear the entire RAID descriptor, just to make sure
1154 * there is no stale data left in the case of a
1155 * reconfiguration
1156 */
1157 memset((char *) raidPtr, 0, sizeof(RF_Raid_t));
1158 raidPtr->raidid = unit;
1159
1160 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1161
1162 if (retcode == 0) {
1163
1164 /* allow this many simultaneous IO's to
1165 this RAID device */
1166 raidPtr->openings = RAIDOUTSTANDING;
1167
1168 raidinit(raidPtr);
1169 rf_markalldirty(raidPtr);
1170 }
1171 /* free the buffers. No return code here. */
1172 if (k_cfg->layoutSpecificSize) {
1173 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1174 }
1175 RF_Free(k_cfg, sizeof(RF_Config_t));
1176
1177 return (retcode);
1178
1179 /* shutdown the system */
1180 case RAIDFRAME_SHUTDOWN:
1181
1182 if ((error = raidlock(rs)) != 0)
1183 return (error);
1184
1185 /*
1186 * If somebody has a partition mounted, we shouldn't
1187 * shutdown.
1188 */
1189
1190 part = DISKPART(dev);
1191 pmask = (1 << part);
1192 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1193 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1194 (rs->sc_dkdev.dk_copenmask & pmask))) {
1195 raidunlock(rs);
1196 return (EBUSY);
1197 }
1198
1199 retcode = rf_Shutdown(raidPtr);
1200
1201 /* It's no longer initialized... */
1202 rs->sc_flags &= ~RAIDF_INITED;
1203
1204 /* free the pseudo device attach bits */
1205
1206 cf = device_cfdata(rs->sc_dev);
1207 /* XXX this causes us to not return any errors
1208 from the above call to rf_Shutdown() */
1209 retcode = config_detach(rs->sc_dev, DETACH_QUIET);
1210 free(cf, M_RAIDFRAME);
1211
1212 /* Detach the disk. */
1213 disk_detach(&rs->sc_dkdev);
1214 disk_destroy(&rs->sc_dkdev);
1215
1216 raidunlock(rs);
1217
1218 return (retcode);
1219 case RAIDFRAME_GET_COMPONENT_LABEL:
1220 clabel_ptr = (RF_ComponentLabel_t **) data;
1221 /* need to read the component label for the disk indicated
1222 by row,column in clabel */
1223
1224 /*
1225 * Perhaps there should be an option to skip the in-core
1226 * copy and hit the disk, as with disklabel(8).
1227 */
1228 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1229
1230 retcode = copyin( *clabel_ptr, clabel,
1231 sizeof(RF_ComponentLabel_t));
1232
1233 if (retcode) {
1234 return(retcode);
1235 }
1236
1237 clabel->row = 0; /* Don't allow looking at anything else.*/
1238
1239 column = clabel->column;
1240
1241 if ((column < 0) || (column >= raidPtr->numCol +
1242 raidPtr->numSpare)) {
1243 return(EINVAL);
1244 }
1245
1246 RF_Free(clabel, sizeof(*clabel));
1247
1248 clabel = raidget_component_label(raidPtr, column);
1249
1250 if (retcode == 0) {
1251 retcode = copyout(clabel, *clabel_ptr,
1252 sizeof(RF_ComponentLabel_t));
1253 }
1254 return (retcode);
1255
1256 #if 0
1257 case RAIDFRAME_SET_COMPONENT_LABEL:
1258 clabel = (RF_ComponentLabel_t *) data;
1259
1260 /* XXX check the label for valid stuff... */
1261 /* Note that some things *should not* get modified --
1262 the user should be re-initing the labels instead of
1263 trying to patch things.
1264 */
1265
1266 raidid = raidPtr->raidid;
1267 #ifdef DEBUG
1268 printf("raid%d: Got component label:\n", raidid);
1269 printf("raid%d: Version: %d\n", raidid, clabel->version);
1270 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1271 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1272 printf("raid%d: Column: %d\n", raidid, clabel->column);
1273 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1274 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1275 printf("raid%d: Status: %d\n", raidid, clabel->status);
1276 #endif
1277 clabel->row = 0;
1278 column = clabel->column;
1279
1280 if ((column < 0) || (column >= raidPtr->numCol)) {
1281 return(EINVAL);
1282 }
1283
1284 /* XXX this isn't allowed to do anything for now :-) */
1285
1286 /* XXX and before it is, we need to fill in the rest
1287 of the fields!?!?!?! */
1288 memcpy(raidget_component_label(raidPtr, column),
1289 clabel, sizeof(*clabel));
1290 raidflush_component_label(raidPtr, column);
1291 return (0);
1292 #endif
1293
1294 case RAIDFRAME_INIT_LABELS:
1295 clabel = (RF_ComponentLabel_t *) data;
1296 /*
1297 we only want the serial number from
1298 the above. We get all the rest of the information
1299 from the config that was used to create this RAID
1300 set.
1301 */
1302
1303 raidPtr->serial_number = clabel->serial_number;
1304
1305 for(column=0;column<raidPtr->numCol;column++) {
1306 diskPtr = &raidPtr->Disks[column];
1307 if (!RF_DEAD_DISK(diskPtr->status)) {
1308 ci_label = raidget_component_label(raidPtr,
1309 column);
1310 /* Zeroing this is important. */
1311 memset(ci_label, 0, sizeof(*ci_label));
1312 raid_init_component_label(raidPtr, ci_label);
1313 ci_label->serial_number =
1314 raidPtr->serial_number;
1315 ci_label->row = 0; /* we dont' pretend to support more */
1316 ci_label->partitionSize =
1317 diskPtr->partitionSize;
1318 ci_label->column = column;
1319 raidflush_component_label(raidPtr, column);
1320 }
1321 /* XXXjld what about the spares? */
1322 }
1323
1324 return (retcode);
1325 case RAIDFRAME_SET_AUTOCONFIG:
1326 d = rf_set_autoconfig(raidPtr, *(int *) data);
1327 printf("raid%d: New autoconfig value is: %d\n",
1328 raidPtr->raidid, d);
1329 *(int *) data = d;
1330 return (retcode);
1331
1332 case RAIDFRAME_SET_ROOT:
1333 d = rf_set_rootpartition(raidPtr, *(int *) data);
1334 printf("raid%d: New rootpartition value is: %d\n",
1335 raidPtr->raidid, d);
1336 *(int *) data = d;
1337 return (retcode);
1338
1339 /* initialize all parity */
1340 case RAIDFRAME_REWRITEPARITY:
1341
1342 if (raidPtr->Layout.map->faultsTolerated == 0) {
1343 /* Parity for RAID 0 is trivially correct */
1344 raidPtr->parity_good = RF_RAID_CLEAN;
1345 return(0);
1346 }
1347
1348 if (raidPtr->parity_rewrite_in_progress == 1) {
1349 /* Re-write is already in progress! */
1350 return(EINVAL);
1351 }
1352
1353 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1354 rf_RewriteParityThread,
1355 raidPtr,"raid_parity");
1356 return (retcode);
1357
1358
1359 case RAIDFRAME_ADD_HOT_SPARE:
1360 sparePtr = (RF_SingleComponent_t *) data;
1361 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1362 retcode = rf_add_hot_spare(raidPtr, &component);
1363 return(retcode);
1364
1365 case RAIDFRAME_REMOVE_HOT_SPARE:
1366 return(retcode);
1367
1368 case RAIDFRAME_DELETE_COMPONENT:
1369 componentPtr = (RF_SingleComponent_t *)data;
1370 memcpy( &component, componentPtr,
1371 sizeof(RF_SingleComponent_t));
1372 retcode = rf_delete_component(raidPtr, &component);
1373 return(retcode);
1374
1375 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1376 componentPtr = (RF_SingleComponent_t *)data;
1377 memcpy( &component, componentPtr,
1378 sizeof(RF_SingleComponent_t));
1379 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1380 return(retcode);
1381
1382 case RAIDFRAME_REBUILD_IN_PLACE:
1383
1384 if (raidPtr->Layout.map->faultsTolerated == 0) {
1385 /* Can't do this on a RAID 0!! */
1386 return(EINVAL);
1387 }
1388
1389 if (raidPtr->recon_in_progress == 1) {
1390 /* a reconstruct is already in progress! */
1391 return(EINVAL);
1392 }
1393
1394 componentPtr = (RF_SingleComponent_t *) data;
1395 memcpy( &component, componentPtr,
1396 sizeof(RF_SingleComponent_t));
1397 component.row = 0; /* we don't support any more */
1398 column = component.column;
1399
1400 if ((column < 0) || (column >= raidPtr->numCol)) {
1401 return(EINVAL);
1402 }
1403
1404 RF_LOCK_MUTEX(raidPtr->mutex);
1405 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1406 (raidPtr->numFailures > 0)) {
1407 /* XXX 0 above shouldn't be constant!!! */
1408 /* some component other than this has failed.
1409 Let's not make things worse than they already
1410 are... */
1411 printf("raid%d: Unable to reconstruct to disk at:\n",
1412 raidPtr->raidid);
1413 printf("raid%d: Col: %d Too many failures.\n",
1414 raidPtr->raidid, column);
1415 RF_UNLOCK_MUTEX(raidPtr->mutex);
1416 return (EINVAL);
1417 }
1418 if (raidPtr->Disks[column].status ==
1419 rf_ds_reconstructing) {
1420 printf("raid%d: Unable to reconstruct to disk at:\n",
1421 raidPtr->raidid);
1422 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column);
1423
1424 RF_UNLOCK_MUTEX(raidPtr->mutex);
1425 return (EINVAL);
1426 }
1427 if (raidPtr->Disks[column].status == rf_ds_spared) {
1428 RF_UNLOCK_MUTEX(raidPtr->mutex);
1429 return (EINVAL);
1430 }
1431 RF_UNLOCK_MUTEX(raidPtr->mutex);
1432
1433 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1434 if (rrcopy == NULL)
1435 return(ENOMEM);
1436
1437 rrcopy->raidPtr = (void *) raidPtr;
1438 rrcopy->col = column;
1439
1440 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1441 rf_ReconstructInPlaceThread,
1442 rrcopy,"raid_reconip");
1443 return(retcode);
1444
1445 case RAIDFRAME_GET_INFO:
1446 if (!raidPtr->valid)
1447 return (ENODEV);
1448 ucfgp = (RF_DeviceConfig_t **) data;
1449 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1450 (RF_DeviceConfig_t *));
1451 if (d_cfg == NULL)
1452 return (ENOMEM);
1453 d_cfg->rows = 1; /* there is only 1 row now */
1454 d_cfg->cols = raidPtr->numCol;
1455 d_cfg->ndevs = raidPtr->numCol;
1456 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1457 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1458 return (ENOMEM);
1459 }
1460 d_cfg->nspares = raidPtr->numSpare;
1461 if (d_cfg->nspares >= RF_MAX_DISKS) {
1462 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1463 return (ENOMEM);
1464 }
1465 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1466 d = 0;
1467 for (j = 0; j < d_cfg->cols; j++) {
1468 d_cfg->devs[d] = raidPtr->Disks[j];
1469 d++;
1470 }
1471 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1472 d_cfg->spares[i] = raidPtr->Disks[j];
1473 if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
1474 /* XXX: raidctl(8) expects to see this as a used spare */
1475 d_cfg->spares[i].status = rf_ds_used_spare;
1476 }
1477 }
1478 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1479 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1480
1481 return (retcode);
1482
1483 case RAIDFRAME_CHECK_PARITY:
1484 *(int *) data = raidPtr->parity_good;
1485 return (0);
1486
1487 case RAIDFRAME_PARITYMAP_STATUS:
1488 rf_paritymap_status(raidPtr->parity_map,
1489 (struct rf_pmstat *)data);
1490 return 0;
1491
1492 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1493 if (raidPtr->parity_map == NULL)
1494 return ENOENT; /* ??? */
1495 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1496 (struct rf_pmparams *)data, 1))
1497 return EINVAL;
1498 return 0;
1499
1500 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1501 *(int *) data = rf_paritymap_get_disable(raidPtr);
1502 return 0;
1503
1504 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1505 rf_paritymap_set_disable(raidPtr, *(int *)data);
1506 /* XXX should errors be passed up? */
1507 return 0;
1508
1509 case RAIDFRAME_RESET_ACCTOTALS:
1510 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1511 return (0);
1512
1513 case RAIDFRAME_GET_ACCTOTALS:
1514 totals = (RF_AccTotals_t *) data;
1515 *totals = raidPtr->acc_totals;
1516 return (0);
1517
1518 case RAIDFRAME_KEEP_ACCTOTALS:
1519 raidPtr->keep_acc_totals = *(int *)data;
1520 return (0);
1521
1522 case RAIDFRAME_GET_SIZE:
1523 *(int *) data = raidPtr->totalSectors;
1524 return (0);
1525
1526 /* fail a disk & optionally start reconstruction */
1527 case RAIDFRAME_FAIL_DISK:
1528
1529 if (raidPtr->Layout.map->faultsTolerated == 0) {
1530 /* Can't do this on a RAID 0!! */
1531 return(EINVAL);
1532 }
1533
1534 rr = (struct rf_recon_req *) data;
1535 rr->row = 0;
1536 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1537 return (EINVAL);
1538
1539
1540 RF_LOCK_MUTEX(raidPtr->mutex);
1541 if (raidPtr->status == rf_rs_reconstructing) {
1542 /* you can't fail a disk while we're reconstructing! */
1543 /* XXX wrong for RAID6 */
1544 RF_UNLOCK_MUTEX(raidPtr->mutex);
1545 return (EINVAL);
1546 }
1547 if ((raidPtr->Disks[rr->col].status ==
1548 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1549 /* some other component has failed. Let's not make
1550 things worse. XXX wrong for RAID6 */
1551 RF_UNLOCK_MUTEX(raidPtr->mutex);
1552 return (EINVAL);
1553 }
1554 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1555 /* Can't fail a spared disk! */
1556 RF_UNLOCK_MUTEX(raidPtr->mutex);
1557 return (EINVAL);
1558 }
1559 RF_UNLOCK_MUTEX(raidPtr->mutex);
1560
1561 /* make a copy of the recon request so that we don't rely on
1562 * the user's buffer */
1563 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1564 if (rrcopy == NULL)
1565 return(ENOMEM);
1566 memcpy(rrcopy, rr, sizeof(*rr));
1567 rrcopy->raidPtr = (void *) raidPtr;
1568
1569 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1570 rf_ReconThread,
1571 rrcopy,"raid_recon");
1572 return (0);
1573
1574 /* invoke a copyback operation after recon on whatever disk
1575 * needs it, if any */
1576 case RAIDFRAME_COPYBACK:
1577
1578 if (raidPtr->Layout.map->faultsTolerated == 0) {
1579 /* This makes no sense on a RAID 0!! */
1580 return(EINVAL);
1581 }
1582
1583 if (raidPtr->copyback_in_progress == 1) {
1584 /* Copyback is already in progress! */
1585 return(EINVAL);
1586 }
1587
1588 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1589 rf_CopybackThread,
1590 raidPtr,"raid_copyback");
1591 return (retcode);
1592
1593 /* return the percentage completion of reconstruction */
1594 case RAIDFRAME_CHECK_RECON_STATUS:
1595 if (raidPtr->Layout.map->faultsTolerated == 0) {
1596 /* This makes no sense on a RAID 0, so tell the
1597 user it's done. */
1598 *(int *) data = 100;
1599 return(0);
1600 }
1601 if (raidPtr->status != rf_rs_reconstructing)
1602 *(int *) data = 100;
1603 else {
1604 if (raidPtr->reconControl->numRUsTotal > 0) {
1605 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1606 } else {
1607 *(int *) data = 0;
1608 }
1609 }
1610 return (0);
1611 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1612 progressInfoPtr = (RF_ProgressInfo_t **) data;
1613 if (raidPtr->status != rf_rs_reconstructing) {
1614 progressInfo.remaining = 0;
1615 progressInfo.completed = 100;
1616 progressInfo.total = 100;
1617 } else {
1618 progressInfo.total =
1619 raidPtr->reconControl->numRUsTotal;
1620 progressInfo.completed =
1621 raidPtr->reconControl->numRUsComplete;
1622 progressInfo.remaining = progressInfo.total -
1623 progressInfo.completed;
1624 }
1625 retcode = copyout(&progressInfo, *progressInfoPtr,
1626 sizeof(RF_ProgressInfo_t));
1627 return (retcode);
1628
1629 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1630 if (raidPtr->Layout.map->faultsTolerated == 0) {
1631 /* This makes no sense on a RAID 0, so tell the
1632 user it's done. */
1633 *(int *) data = 100;
1634 return(0);
1635 }
1636 if (raidPtr->parity_rewrite_in_progress == 1) {
1637 *(int *) data = 100 *
1638 raidPtr->parity_rewrite_stripes_done /
1639 raidPtr->Layout.numStripe;
1640 } else {
1641 *(int *) data = 100;
1642 }
1643 return (0);
1644
1645 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1646 progressInfoPtr = (RF_ProgressInfo_t **) data;
1647 if (raidPtr->parity_rewrite_in_progress == 1) {
1648 progressInfo.total = raidPtr->Layout.numStripe;
1649 progressInfo.completed =
1650 raidPtr->parity_rewrite_stripes_done;
1651 progressInfo.remaining = progressInfo.total -
1652 progressInfo.completed;
1653 } else {
1654 progressInfo.remaining = 0;
1655 progressInfo.completed = 100;
1656 progressInfo.total = 100;
1657 }
1658 retcode = copyout(&progressInfo, *progressInfoPtr,
1659 sizeof(RF_ProgressInfo_t));
1660 return (retcode);
1661
1662 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1663 if (raidPtr->Layout.map->faultsTolerated == 0) {
1664 /* This makes no sense on a RAID 0 */
1665 *(int *) data = 100;
1666 return(0);
1667 }
1668 if (raidPtr->copyback_in_progress == 1) {
1669 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1670 raidPtr->Layout.numStripe;
1671 } else {
1672 *(int *) data = 100;
1673 }
1674 return (0);
1675
1676 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1677 progressInfoPtr = (RF_ProgressInfo_t **) data;
1678 if (raidPtr->copyback_in_progress == 1) {
1679 progressInfo.total = raidPtr->Layout.numStripe;
1680 progressInfo.completed =
1681 raidPtr->copyback_stripes_done;
1682 progressInfo.remaining = progressInfo.total -
1683 progressInfo.completed;
1684 } else {
1685 progressInfo.remaining = 0;
1686 progressInfo.completed = 100;
1687 progressInfo.total = 100;
1688 }
1689 retcode = copyout(&progressInfo, *progressInfoPtr,
1690 sizeof(RF_ProgressInfo_t));
1691 return (retcode);
1692
1693 /* the sparetable daemon calls this to wait for the kernel to
1694 * need a spare table. this ioctl does not return until a
1695 * spare table is needed. XXX -- calling mpsleep here in the
1696 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1697 * -- I should either compute the spare table in the kernel,
1698 * or have a different -- XXX XXX -- interface (a different
1699 * character device) for delivering the table -- XXX */
1700 #if 0
1701 case RAIDFRAME_SPARET_WAIT:
1702 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1703 while (!rf_sparet_wait_queue)
1704 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1705 waitreq = rf_sparet_wait_queue;
1706 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1707 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1708
1709 /* structure assignment */
1710 *((RF_SparetWait_t *) data) = *waitreq;
1711
1712 RF_Free(waitreq, sizeof(*waitreq));
1713 return (0);
1714
1715 /* wakes up a process waiting on SPARET_WAIT and puts an error
1716 * code in it that will cause the dameon to exit */
1717 case RAIDFRAME_ABORT_SPARET_WAIT:
1718 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1719 waitreq->fcol = -1;
1720 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1721 waitreq->next = rf_sparet_wait_queue;
1722 rf_sparet_wait_queue = waitreq;
1723 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1724 wakeup(&rf_sparet_wait_queue);
1725 return (0);
1726
1727 /* used by the spare table daemon to deliver a spare table
1728 * into the kernel */
1729 case RAIDFRAME_SEND_SPARET:
1730
1731 /* install the spare table */
1732 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1733
1734 /* respond to the requestor. the return status of the spare
1735 * table installation is passed in the "fcol" field */
1736 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1737 waitreq->fcol = retcode;
1738 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1739 waitreq->next = rf_sparet_resp_queue;
1740 rf_sparet_resp_queue = waitreq;
1741 wakeup(&rf_sparet_resp_queue);
1742 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1743
1744 return (retcode);
1745 #endif
1746
1747 default:
1748 break; /* fall through to the os-specific code below */
1749
1750 }
1751
1752 if (!raidPtr->valid)
1753 return (EINVAL);
1754
1755 /*
1756 * Add support for "regular" device ioctls here.
1757 */
1758
1759 switch (cmd) {
1760 case DIOCGDINFO:
1761 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1762 break;
1763 #ifdef __HAVE_OLD_DISKLABEL
1764 case ODIOCGDINFO:
1765 newlabel = *(rs->sc_dkdev.dk_label);
1766 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1767 return ENOTTY;
1768 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1769 break;
1770 #endif
1771
1772 case DIOCGPART:
1773 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1774 ((struct partinfo *) data)->part =
1775 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1776 break;
1777
1778 case DIOCWDINFO:
1779 case DIOCSDINFO:
1780 #ifdef __HAVE_OLD_DISKLABEL
1781 case ODIOCWDINFO:
1782 case ODIOCSDINFO:
1783 #endif
1784 {
1785 struct disklabel *lp;
1786 #ifdef __HAVE_OLD_DISKLABEL
1787 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1788 memset(&newlabel, 0, sizeof newlabel);
1789 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1790 lp = &newlabel;
1791 } else
1792 #endif
1793 lp = (struct disklabel *)data;
1794
1795 if ((error = raidlock(rs)) != 0)
1796 return (error);
1797
1798 rs->sc_flags |= RAIDF_LABELLING;
1799
1800 error = setdisklabel(rs->sc_dkdev.dk_label,
1801 lp, 0, rs->sc_dkdev.dk_cpulabel);
1802 if (error == 0) {
1803 if (cmd == DIOCWDINFO
1804 #ifdef __HAVE_OLD_DISKLABEL
1805 || cmd == ODIOCWDINFO
1806 #endif
1807 )
1808 error = writedisklabel(RAIDLABELDEV(dev),
1809 raidstrategy, rs->sc_dkdev.dk_label,
1810 rs->sc_dkdev.dk_cpulabel);
1811 }
1812 rs->sc_flags &= ~RAIDF_LABELLING;
1813
1814 raidunlock(rs);
1815
1816 if (error)
1817 return (error);
1818 break;
1819 }
1820
1821 case DIOCWLABEL:
1822 if (*(int *) data != 0)
1823 rs->sc_flags |= RAIDF_WLABEL;
1824 else
1825 rs->sc_flags &= ~RAIDF_WLABEL;
1826 break;
1827
1828 case DIOCGDEFLABEL:
1829 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1830 break;
1831
1832 #ifdef __HAVE_OLD_DISKLABEL
1833 case ODIOCGDEFLABEL:
1834 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1835 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1836 return ENOTTY;
1837 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1838 break;
1839 #endif
1840
1841 case DIOCAWEDGE:
1842 case DIOCDWEDGE:
1843 dkw = (void *)data;
1844
1845 /* If the ioctl happens here, the parent is us. */
1846 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1847 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1848
1849 case DIOCLWEDGES:
1850 return dkwedge_list(&rs->sc_dkdev,
1851 (struct dkwedge_list *)data, l);
1852 case DIOCCACHESYNC:
1853 return rf_sync_component_caches(raidPtr);
1854 default:
1855 retcode = ENOTTY;
1856 }
1857 return (retcode);
1858
1859 }
1860
1861
1862 /* raidinit -- complete the rest of the initialization for the
1863 RAIDframe device. */
1864
1865
static void
raidinit(RF_Raid_t *raidPtr)
{
	struct cfdata *cf;
	struct raid_softc *rs;
	int unit;

	/* raidPtr has already been configured; its raidid selects our softc. */
	unit = raidPtr->raidid;

	rs = &raid_softc[unit];

	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device */
	/*
	 * Build a cfdata by hand for config_attach_pseudo(); freed later
	 * by the RAIDFRAME_SHUTDOWN path (see raidioctl), which retrieves
	 * it with device_cfdata() before config_detach().
	 */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev==NULL) {
		/* NOTE(review): only logged; callers proceed regardless. */
		printf("raid%d: config_attach_pseudo failed\n",
		       raidPtr->raidid);
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	/* Advertise the full array capacity in sectors. */
	rs->sc_size = raidPtr->totalSectors;

	/* Scan the new disk for wedges (GPT/other partitions). */
	dkwedge_discover(&rs->sc_dkdev);

	rf_set_properties(rs, raidPtr);

}
1914 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1915 /* wake up the daemon & tell it to get us a spare table
1916 * XXX
1917 * the entries in the queues should be tagged with the raidPtr
1918 * so that in the extremely rare case that two recons happen at once,
1919 * we know for which device were requesting a spare table
1920 * XXX
1921 *
1922 * XXX This code is not currently used. GO
1923 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	/*
	 * Queue the request on rf_sparet_wait_queue and wake the
	 * user-level sparetable daemon blocked in RAIDFRAME_SPARET_WAIT,
	 * then sleep until a response shows up on rf_sparet_resp_queue.
	 * The daemon's status is carried back in the response's fcol
	 * field (see RAIDFRAME_SEND_SPARET).
	 */
	RF_LOCK_MUTEX(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	wakeup(&rf_sparet_wait_queue);

	/* mpsleep unlocks the mutex */
	/*
	 * NOTE(review): the comment above predates the switch to tsleep();
	 * tsleep() does not release rf_sparet_wait_mutex, so the mutex
	 * appears to be held across the sleep -- confirm before re-enabling
	 * this (currently unused) path.
	 */
	while (!rf_sparet_resp_queue) {
		tsleep(&rf_sparet_resp_queue, PRIBIO,
		       "raidframe getsparetable", 0);
	}
	/* Dequeue the response element -- a different object than 'req'. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1948 #endif
1949
1950 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1951 * bp & passes it down.
1952 * any calls originating in the kernel must use non-blocking I/O
1953 * do some extra sanity checking to return "appropriate" error values for
1954 * certain conditions (to make some standard utilities work)
1955 *
1956 * Formerly known as: rf_DoAccessKernel
1957 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	int unit;
	struct raid_softc *rs;
	int do_async;
	struct buf *bp;
	int rc;

	unit = raidPtr->raidid;
	rs = &raid_softc[unit];

	/* quick check to see if anything has died recently */
	RF_LOCK_MUTEX(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the mutex around the label update, then retake it
		 * before decrementing the failure count */
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Loop invariant: raidPtr->mutex is held at the top of each
	 * iteration and released while we work on a buffer. */
	while (raidPtr->openings > 0) {
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = BUFQ_GET(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb accounts for a trailing partial sector, if any */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject requests past end-of-device; the extra (sum < x)
		 * comparisons catch arithmetic wrap-around. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Reject transfers that aren't a whole number of sectors. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* Consume one opening for this request. */
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->openings--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* rf_DoAccess() refused the request; fail the buffer
			 * here and keep draining the queue. */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		RF_LOCK_MUTEX(raidPtr->mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->mutex);
}
2078
2079
2080
2081
2082 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2083
/*
 * Dispatch one request from a RAIDframe disk queue to the underlying
 * component device.  The queue mutex is held on entry; it is dropped
 * around bdev_strategy() (which can sleep) and re-taken before
 * returning.  Completion is delivered asynchronously through
 * KernelWakeupFunc().  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;

#if DIAGNOSTIC
	if (queue->raidPtr->raidid >= numraid) {
		printf("Invalid unit number: %d %d\n", queue->raidPtr->raidid,
		    numraid);
		panic("Invalid Unit number in rf_DispatchKernelIO");
	}
#endif

	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately through the normal I/O
		 * completion path. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up bp to target the component's device; completion
		 * lands in KernelWakeupFunc with req in b_private. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with an I/O invoked from
   kernel code.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;
	int s;

	s = splbio();
	db1_printf(("recovering the request queue:\n"));

	/* The originating request was stashed in b_private by InitBP()
	 * (or by the NOP path of rf_DispatchKernelIO). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		RF_LOCK_MUTEX(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		RF_UNLOCK_MUTEX(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error. Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */

	req->error = bp->b_error;

	simple_lock(&queue->raidPtr->iodone_lock);

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	wakeup(&(queue->raidPtr->iodone));

	simple_unlock(&queue->raidPtr->iodone_lock);

	splx(s);
}
2238
2239
2240
/*
 * Initialize a buf structure for doing an I/O in the kernel.
 *
 * The target is identified by dev; b_vp is currently unused here.
 * cbFunc/cbArg become b_iodone/b_private so the completion handler can
 * recover the originating request.  Panics if the computed byte count
 * (numSect << logBytesPerSector) is zero.
 */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
       struct proc *b_proc)
{
	/* bp->b_flags = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	bp->b_blkno = startSect;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
2268
2269 static void
2270 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2271 struct disklabel *lp)
2272 {
2273 memset(lp, 0, sizeof(*lp));
2274
2275 /* fabricate a label... */
2276 lp->d_secperunit = raidPtr->totalSectors;
2277 lp->d_secsize = raidPtr->bytesPerSector;
2278 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2279 lp->d_ntracks = 4 * raidPtr->numCol;
2280 lp->d_ncylinders = raidPtr->totalSectors /
2281 (lp->d_nsectors * lp->d_ntracks);
2282 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2283
2284 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2285 lp->d_type = DTYPE_RAID;
2286 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2287 lp->d_rpm = 3600;
2288 lp->d_interleave = 1;
2289 lp->d_flags = 0;
2290
2291 lp->d_partitions[RAW_PART].p_offset = 0;
2292 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2293 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2294 lp->d_npartitions = RAW_PART + 1;
2295
2296 lp->d_magic = DISKMAGIC;
2297 lp->d_magic2 = DISKMAGIC;
2298 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2299
2300 }
/*
 * Read the disklabel from the raid device.  If one is not present, fake one
 * up.  Also sanity-checks a found label against the actual size of the
 * set, since changing the interleave can change the total size even
 * with identical components.
 */
static void
raidgetdisklabel(dev_t dev)
{
	int unit = raidunit(dev);
	struct raid_softc *rs = &raid_softc[unit];
	const char *errstring;
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
	RF_Raid_t *raidPtr;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = raidPtrs[unit];

	/* Start from a fabricated label so every field is sane even if
	 * readdisklabel() only partially succeeds. */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				    "exceeds the size of raid (%" PRIu64 ")\n",
				    unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
/*
 * Take care of things one might want to take care of in the event
 * that a disklabel isn't present: install default contents into the
 * in-core label and checksum it.
 */
static void
raidmakedisklabel(struct raid_softc *rs)
{
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	db1_printf(("Making a label..\n"));

	/*
	 * For historical reasons, if there's no disklabel present
	 * the raw partition must be marked FS_BSDFFS.
	 */

	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;

	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));

	/* Re-checksum after the edits above so the label verifies. */
	lp->d_checksum = dkcksum(lp);
}
2378 /*
2379 * Wait interruptibly for an exclusive lock.
2380 *
2381 * XXX
2382 * Several drivers do this; it should be abstracted and made MP-safe.
2383 * (Hmm... where have we seen this warning before :-> GO )
2384 */
2385 static int
2386 raidlock(struct raid_softc *rs)
2387 {
2388 int error;
2389
2390 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2391 rs->sc_flags |= RAIDF_WANTED;
2392 if ((error =
2393 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2394 return (error);
2395 }
2396 rs->sc_flags |= RAIDF_LOCKED;
2397 return (0);
2398 }
2399 /*
2400 * Unlock and wake up any waiters.
2401 */
2402 static void
2403 raidunlock(struct raid_softc *rs)
2404 {
2405
2406 rs->sc_flags &= ~RAIDF_LOCKED;
2407 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2408 rs->sc_flags &= ~RAIDF_WANTED;
2409 wakeup(rs);
2410 }
2411 }
2412
2413
2414 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2415 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2416 #define RF_PARITY_MAP_OFFSET \
2417 (RF_COMPONENT_INFO_OFFSET + RF_COMPONENT_INFO_SIZE)
2418 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2419
2420 int
2421 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2422 {
2423 RF_ComponentLabel_t *clabel;
2424
2425 clabel = raidget_component_label(raidPtr, col);
2426 clabel->clean = RF_RAID_CLEAN;
2427 raidflush_component_label(raidPtr, col);
2428 return(0);
2429 }
2430
2431
2432 int
2433 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2434 {
2435 RF_ComponentLabel_t *clabel;
2436
2437 clabel = raidget_component_label(raidPtr, col);
2438 clabel->clean = RF_RAID_DIRTY;
2439 raidflush_component_label(raidPtr, col);
2440 return(0);
2441 }
2442
2443 int
2444 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2445 {
2446 return raidread_component_label(raidPtr->Disks[col].dev,
2447 raidPtr->raid_cinfo[col].ci_vp,
2448 &raidPtr->raid_cinfo[col].ci_label);
2449 }
2450
/*
 * Return a pointer to the in-core component label for column `col'.
 * The caller may modify it; raidflush_component_label() writes it out.
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2456
2457 int
2458 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2459 {
2460 RF_ComponentLabel_t *label;
2461
2462 label = &raidPtr->raid_cinfo[col].ci_label;
2463 label->mod_counter = raidPtr->mod_counter;
2464 #ifndef RF_NO_PARITY_MAP
2465 label->parity_map_modcount = label->mod_counter;
2466 #endif
2467 return raidwrite_component_label(raidPtr->Disks[col].dev,
2468 raidPtr->raid_cinfo[col].ci_vp, label);
2469 }
2470
2471
/*
 * Read the component label stored at (dev, b_vp) into *clabel.
 * The label lives in the reserved area at RF_COMPONENT_INFO_OFFSET.
 * Returns 0 on success or an errno value.
 */
static int
raidread_component_label(dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    RF_COMPONENT_INFO_OFFSET, RF_COMPONENT_INFO_SIZE);
}
2480
2481 /* ARGSUSED */
2482 static int
2483 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2484 size_t msize, daddr_t offset, daddr_t dsize)
2485 {
2486 struct buf *bp;
2487 const struct bdevsw *bdev;
2488 int error;
2489
2490 /* XXX should probably ensure that we don't try to do this if
2491 someone has changed rf_protected_sectors. */
2492
2493 if (b_vp == NULL) {
2494 /* For whatever reason, this component is not valid.
2495 Don't try to read a component label from it. */
2496 return(EINVAL);
2497 }
2498
2499 /* get a block of the appropriate size... */
2500 bp = geteblk((int)dsize);
2501 bp->b_dev = dev;
2502
2503 /* get our ducks in a row for the read */
2504 bp->b_blkno = offset / DEV_BSIZE;
2505 bp->b_bcount = dsize;
2506 bp->b_flags |= B_READ;
2507 bp->b_resid = dsize;
2508
2509 bdev = bdevsw_lookup(bp->b_dev);
2510 if (bdev == NULL)
2511 return (ENXIO);
2512 (*bdev->d_strategy)(bp);
2513
2514 error = biowait(bp);
2515
2516 if (!error) {
2517 memcpy(data, bp->b_data, msize);
2518 }
2519
2520 brelse(bp, 0);
2521 return(error);
2522 }
2523
2524
/*
 * Write *clabel to the component label area at (dev, b_vp),
 * synchronously.  Returns 0 on success or an errno value.
 */
static int
raidwrite_component_label(dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    RF_COMPONENT_INFO_OFFSET, RF_COMPONENT_INFO_SIZE, 0);
}
2533
2534 /* ARGSUSED */
2535 static int
2536 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2537 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2538 {
2539 struct buf *bp;
2540 const struct bdevsw *bdev;
2541 int error;
2542
2543 /* get a block of the appropriate size... */
2544 bp = geteblk((int)dsize);
2545 bp->b_dev = dev;
2546
2547 /* get our ducks in a row for the write */
2548 bp->b_blkno = offset / DEV_BSIZE;
2549 bp->b_bcount = dsize;
2550 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2551 bp->b_resid = dsize;
2552
2553 memset(bp->b_data, 0, dsize);
2554 memcpy(bp->b_data, data, msize);
2555
2556 bdev = bdevsw_lookup(bp->b_dev);
2557 if (bdev == NULL)
2558 return (ENXIO);
2559 (*bdev->d_strategy)(bp);
2560 if (asyncp)
2561 return 0;
2562 error = biowait(bp);
2563 brelse(bp, 0);
2564 if (error) {
2565 #if 1
2566 printf("Failed to write RAID component info!\n");
2567 #endif
2568 }
2569
2570 return(error);
2571 }
2572
2573 void
2574 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2575 {
2576 int c;
2577
2578 for (c = 0; c < raidPtr->numCol; c++) {
2579 /* Skip dead disks. */
2580 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2581 continue;
2582 /* XXXjld: what if an error occurs here? */
2583 raidwrite_component_area(raidPtr->Disks[c].dev,
2584 raidPtr->raid_cinfo[c].ci_vp, map,
2585 RF_PARITYMAP_NBYTE,
2586 RF_PARITY_MAP_OFFSET, RF_PARITY_MAP_SIZE, 0);
2587 }
2588 }
2589
2590 void
2591 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2592 {
2593 struct rf_paritymap_ondisk tmp;
2594 int c,first;
2595
2596 first=1;
2597 for (c = 0; c < raidPtr->numCol; c++) {
2598 /* Skip dead disks. */
2599 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2600 continue;
2601 raidread_component_area(raidPtr->Disks[c].dev,
2602 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2603 RF_PARITYMAP_NBYTE,
2604 RF_PARITY_MAP_OFFSET, RF_PARITY_MAP_SIZE);
2605 if (first) {
2606 memcpy(map, &tmp, sizeof(*map));
2607 first = 0;
2608 } else {
2609 rf_paritymap_merge(map, &tmp);
2610 }
2611 }
2612 }
2613
/*
 * Bump the modification counter and mark the component label of every
 * live component (and used spare) dirty, so an unclean shutdown can be
 * detected at the next configuration.  Dead and spared-out components
 * are left untouched.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column matched, scol keeps
			   its previous value (-1 initially) -- confirm this
			   cannot happen for a rf_ds_used_spare disk. */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2673
2674
/*
 * Push the current state of the set out to the component labels of all
 * optimal components and used spares.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE and the parity is known good, the labels
 * are also marked clean.  Bumps the modification counter first.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare replaced. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column matched, scol keeps
			   its previous value -- confirm this cannot happen
			   for a rf_ds_used_spare disk. */

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2749
2750 void
2751 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2752 {
2753
2754 if (vp != NULL) {
2755 if (auto_configured == 1) {
2756 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2757 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2758 vput(vp);
2759
2760 } else {
2761 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2762 }
2763 }
2764 }
2765
2766
2767 void
2768 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2769 {
2770 int r,c;
2771 struct vnode *vp;
2772 int acd;
2773
2774
2775 /* We take this opportunity to close the vnodes like we should.. */
2776
2777 for (c = 0; c < raidPtr->numCol; c++) {
2778 vp = raidPtr->raid_cinfo[c].ci_vp;
2779 acd = raidPtr->Disks[c].auto_configured;
2780 rf_close_component(raidPtr, vp, acd);
2781 raidPtr->raid_cinfo[c].ci_vp = NULL;
2782 raidPtr->Disks[c].auto_configured = 0;
2783 }
2784
2785 for (r = 0; r < raidPtr->numSpare; r++) {
2786 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2787 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2788 rf_close_component(raidPtr, vp, acd);
2789 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2790 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2791 }
2792 }
2793
2794
/*
 * Kernel thread body: fail the component named in the request and, if
 * RF_FDFLAGS_RECON is set, reconstruct its contents to a spare.
 * Frees the request and exits the thread.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2816
/*
 * Kernel thread body: rewrite all parity on the set.  On success the
 * in-core parity state is marked clean so the component labels will be
 * marked clean at shutdown.  Wakes any thread blocked in shutdown.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity!\n",raidPtr->raidid);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2846
2847
/*
 * Kernel thread body: copy reconstructed data from used spares back to
 * the components they replaced, then exit the thread.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2862
2863
/*
 * Kernel thread body: reconstruct the component in req->col in place
 * (onto the same disk slot).  Frees the request and exits the thread.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2881
/*
 * Read and validate the component label on (dev, vp).  If it looks
 * reasonable, prepend a new RF_AutoConfig_t for it to ac_list and keep
 * the vnode open; otherwise close and release the vnode.  Returns the
 * (possibly updated) list, or NULL if memory ran out (after releasing
 * the entire list).
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* Out of memory: tear down everything collected so far. */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel) &&
		    (clabel->partitionSize <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			       cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: the label was unusable, so also drop our
		   reference to the component's vnode. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2938
/*
 * Scan every disk-class device in the system for wedges of type
 * DKW_PTYPE_RAIDFRAME or partitions of type FS_RAID, and collect their
 * component labels into an RF_AutoConfig_t list for auto-configuration.
 * Returns the list (possibly NULL).
 */
RF_AutoConfig_t *
rf_find_raid_components()
{
	struct vnode *vp;
	struct disklabel label;
	struct device *dv;
	dev_t dev;
	int bmajor, bminor, wedge;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;


	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = alldevs.tqh_first; dv != NULL;
	     dv = dv->dv_list.tqe_next) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		if (wedge) {
			/* Wedges carry their type in dkwedge_info rather
			   than in a disklabel partition entry. */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component takes ownership of vp. */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size);
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		/* NOTE(review): opened with FREAD but closed with
		   FREAD | FWRITE -- confirm this is intended. */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			/* rf_get_component takes ownership of vp. */
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size);
		}
	}
	return ac_list;
}
3081
3082
3083 static int
3084 rf_reasonable_label(RF_ComponentLabel_t *clabel)
3085 {
3086
3087 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3088 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3089 ((clabel->clean == RF_RAID_CLEAN) ||
3090 (clabel->clean == RF_RAID_DIRTY)) &&
3091 clabel->row >=0 &&
3092 clabel->column >= 0 &&
3093 clabel->num_rows > 0 &&
3094 clabel->num_columns > 0 &&
3095 clabel->row < clabel->num_rows &&
3096 clabel->column < clabel->num_columns &&
3097 clabel->blockSize > 0 &&
3098 clabel->numBlocks > 0) {
3099 /* label looks reasonable enough... */
3100 return(1);
3101 }
3102 return(0);
3103 }
3104
3105
3106 #ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status );
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %d\n",
	       (char) clabel->parityConfig, clabel->blockSize,
	       clabel->numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No" );
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No" );
	printf("   Last configured as: raid%d\n", clabel->last_unit );
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
3132 #endif
3133
3134 RF_ConfigSet_t *
3135 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3136 {
3137 RF_AutoConfig_t *ac;
3138 RF_ConfigSet_t *config_sets;
3139 RF_ConfigSet_t *cset;
3140 RF_AutoConfig_t *ac_next;
3141
3142
3143 config_sets = NULL;
3144
3145 /* Go through the AutoConfig list, and figure out which components
3146 belong to what sets. */
3147 ac = ac_list;
3148 while(ac!=NULL) {
3149 /* we're going to putz with ac->next, so save it here
3150 for use at the end of the loop */
3151 ac_next = ac->next;
3152
3153 if (config_sets == NULL) {
3154 /* will need at least this one... */
3155 config_sets = (RF_ConfigSet_t *)
3156 malloc(sizeof(RF_ConfigSet_t),
3157 M_RAIDFRAME, M_NOWAIT);
3158 if (config_sets == NULL) {
3159 panic("rf_create_auto_sets: No memory!");
3160 }
3161 /* this one is easy :) */
3162 config_sets->ac = ac;
3163 config_sets->next = NULL;
3164 config_sets->rootable = 0;
3165 ac->next = NULL;
3166 } else {
3167 /* which set does this component fit into? */
3168 cset = config_sets;
3169 while(cset!=NULL) {
3170 if (rf_does_it_fit(cset, ac)) {
3171 /* looks like it matches... */
3172 ac->next = cset->ac;
3173 cset->ac = ac;
3174 break;
3175 }
3176 cset = cset->next;
3177 }
3178 if (cset==NULL) {
3179 /* didn't find a match above... new set..*/
3180 cset = (RF_ConfigSet_t *)
3181 malloc(sizeof(RF_ConfigSet_t),
3182 M_RAIDFRAME, M_NOWAIT);
3183 if (cset == NULL) {
3184 panic("rf_create_auto_sets: No memory!");
3185 }
3186 cset->ac = ac;
3187 ac->next = NULL;
3188 cset->next = config_sets;
3189 cset->rootable = 0;
3190 config_sets = cset;
3191 }
3192 }
3193 ac = ac_next;
3194 }
3195
3196
3197 return(config_sets);
3198 }
3199
3200 static int
3201 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3202 {
3203 RF_ComponentLabel_t *clabel1, *clabel2;
3204
3205 /* If this one matches the *first* one in the set, that's good
3206 enough, since the other members of the set would have been
3207 through here too... */
3208 /* note that we are not checking partitionSize here..
3209
3210 Note that we are also not checking the mod_counters here.
3211 If everything else matches execpt the mod_counter, that's
3212 good enough for this test. We will deal with the mod_counters
3213 a little later in the autoconfiguration process.
3214
3215 (clabel1->mod_counter == clabel2->mod_counter) &&
3216
3217 The reason we don't check for this is that failed disks
3218 will have lower modification counts. If those disks are
3219 not added to the set they used to belong to, then they will
3220 form their own set, which may result in 2 different sets,
3221 for example, competing to be configured at raid0, and
3222 perhaps competing to be the root filesystem set. If the
3223 wrong ones get configured, or both attempt to become /,
3224 weird behaviour and or serious lossage will occur. Thus we
3225 need to bring them into the fold here, and kick them out at
3226 a later point.
3227
3228 */
3229
3230 clabel1 = cset->ac->clabel;
3231 clabel2 = ac->clabel;
3232 if ((clabel1->version == clabel2->version) &&
3233 (clabel1->serial_number == clabel2->serial_number) &&
3234 (clabel1->num_rows == clabel2->num_rows) &&
3235 (clabel1->num_columns == clabel2->num_columns) &&
3236 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3237 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3238 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3239 (clabel1->parityConfig == clabel2->parityConfig) &&
3240 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3241 (clabel1->blockSize == clabel2->blockSize) &&
3242 (clabel1->numBlocks == clabel2->numBlocks) &&
3243 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3244 (clabel1->root_partition == clabel2->root_partition) &&
3245 (clabel1->last_unit == clabel2->last_unit) &&
3246 (clabel1->config_order == clabel2->config_order)) {
3247 /* if it get's here, it almost *has* to be a match */
3248 } else {
3249 /* it's not consistent with somebody in the set..
3250 punt */
3251 return(0);
3252 }
3253 /* all was fine.. it must fit... */
3254 return(1);
3255 }
3256
/*
 * Decide whether the configuration set 'cset' still has enough working
 * components to be brought up.  Returns 1 if the set can be configured,
 * 0 if too many up-to-date components are missing.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set. If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* The highest mod_counter in the set wins: components carrying a
	   lower counter are stale (e.g. disks that failed earlier and
	   missed subsequent label updates). */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* Look for an up-to-date component claiming column 'c'. */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component. If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just finished the odd (second) half of a
			   mirror pair, and we didn't bail.. reset the
			   even_pair_failed flag, and go on to the next
			   pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		/* RAID 0 tolerates no missing components; RAID 4/5
		   tolerate at most one. */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3359
3360 void
3361 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3362 RF_Raid_t *raidPtr)
3363 {
3364 RF_ComponentLabel_t *clabel;
3365 int i;
3366
3367 clabel = ac->clabel;
3368
3369 /* 1. Fill in the common stuff */
3370 config->numRow = clabel->num_rows = 1;
3371 config->numCol = clabel->num_columns;
3372 config->numSpare = 0; /* XXX should this be set here? */
3373 config->sectPerSU = clabel->sectPerSU;
3374 config->SUsPerPU = clabel->SUsPerPU;
3375 config->SUsPerRU = clabel->SUsPerRU;
3376 config->parityConfig = clabel->parityConfig;
3377 /* XXX... */
3378 strcpy(config->diskQueueType,"fifo");
3379 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3380 config->layoutSpecificSize = 0; /* XXX ?? */
3381
3382 while(ac!=NULL) {
3383 /* row/col values will be in range due to the checks
3384 in reasonable_label() */
3385 strcpy(config->devnames[0][ac->clabel->column],
3386 ac->devname);
3387 ac = ac->next;
3388 }
3389
3390 for(i=0;i<RF_MAXDBGV;i++) {
3391 config->debugVars[i][0] = 0;
3392 }
3393 }
3394
3395 int
3396 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3397 {
3398 RF_ComponentLabel_t *clabel;
3399 int column;
3400 int sparecol;
3401
3402 raidPtr->autoconfigure = new_value;
3403
3404 for(column=0; column<raidPtr->numCol; column++) {
3405 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3406 clabel = raidget_component_label(raidPtr, column);
3407 clabel->autoconfigure = new_value;
3408 raidflush_component_label(raidPtr, column);
3409 }
3410 }
3411 for(column = 0; column < raidPtr->numSpare ; column++) {
3412 sparecol = raidPtr->numCol + column;
3413 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3414 clabel = raidget_component_label(raidPtr, sparecol);
3415 clabel->autoconfigure = new_value;
3416 raidflush_component_label(raidPtr, sparecol);
3417 }
3418 }
3419 return(new_value);
3420 }
3421
3422 int
3423 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3424 {
3425 RF_ComponentLabel_t *clabel;
3426 int column;
3427 int sparecol;
3428
3429 raidPtr->root_partition = new_value;
3430 for(column=0; column<raidPtr->numCol; column++) {
3431 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3432 clabel = raidget_component_label(raidPtr, column);
3433 clabel->root_partition = new_value;
3434 raidflush_component_label(raidPtr, column);
3435 }
3436 }
3437 for(column = 0; column < raidPtr->numSpare ; column++) {
3438 sparecol = raidPtr->numCol + column;
3439 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3440 clabel = raidget_component_label(raidPtr, sparecol);
3441 clabel->root_partition = new_value;
3442 raidflush_component_label(raidPtr, sparecol);
3443 }
3444 }
3445 return(new_value);
3446 }
3447
3448 void
3449 rf_release_all_vps(RF_ConfigSet_t *cset)
3450 {
3451 RF_AutoConfig_t *ac;
3452
3453 ac = cset->ac;
3454 while(ac!=NULL) {
3455 /* Close the vp, and give it back */
3456 if (ac->vp) {
3457 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3458 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3459 vput(ac->vp);
3460 ac->vp = NULL;
3461 }
3462 ac = ac->next;
3463 }
3464 }
3465
3466
3467 void
3468 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3469 {
3470 RF_AutoConfig_t *ac;
3471 RF_AutoConfig_t *next_ac;
3472
3473 ac = cset->ac;
3474 while(ac!=NULL) {
3475 next_ac = ac->next;
3476 /* nuke the label */
3477 free(ac->clabel, M_RAIDFRAME);
3478 /* cleanup the config structure */
3479 free(ac, M_RAIDFRAME);
3480 /* "next.." */
3481 ac = next_ac;
3482 }
3483 /* and, finally, nuke the config set */
3484 free(cset, M_RAIDFRAME);
3485 }
3486
3487
/*
 * Initialize 'clabel' from the current state of 'raidPtr' so that it
 * can be written out as the on-disk component label for this set.
 * Per-component fields (column, partitionSize, ...) are filled in
 * elsewhere; this covers only the set-wide values.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* layout geometry */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	clabel->numBlocks = raidPtr->sectorsPerDisk;

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	/* remember which unit we ran at, for autoconfig next boot */
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	/* record the parity map parameters in the label as well */
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3520
3521 int
3522 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
3523 {
3524 RF_Raid_t *raidPtr;
3525 RF_Config_t *config;
3526 int raidID;
3527 int retcode;
3528
3529 #ifdef DEBUG
3530 printf("RAID autoconfigure\n");
3531 #endif
3532
3533 retcode = 0;
3534 *unit = -1;
3535
3536 /* 1. Create a config structure */
3537
3538 config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
3539 M_RAIDFRAME,
3540 M_NOWAIT);
3541 if (config==NULL) {
3542 printf("Out of mem!?!?\n");
3543 /* XXX do something more intelligent here. */
3544 return(1);
3545 }
3546
3547 memset(config, 0, sizeof(RF_Config_t));
3548
3549 /*
3550 2. Figure out what RAID ID this one is supposed to live at
3551 See if we can get the same RAID dev that it was configured
3552 on last time..
3553 */
3554
3555 raidID = cset->ac->clabel->last_unit;
3556 if ((raidID < 0) || (raidID >= numraid)) {
3557 /* let's not wander off into lala land. */
3558 raidID = numraid - 1;
3559 }
3560 if (raidPtrs[raidID]->valid != 0) {
3561
3562 /*
3563 Nope... Go looking for an alternative...
3564 Start high so we don't immediately use raid0 if that's
3565 not taken.
3566 */
3567
3568 for(raidID = numraid - 1; raidID >= 0; raidID--) {
3569 if (raidPtrs[raidID]->valid == 0) {
3570 /* can use this one! */
3571 break;
3572 }
3573 }
3574 }
3575
3576 if (raidID < 0) {
3577 /* punt... */
3578 printf("Unable to auto configure this set!\n");
3579 printf("(Out of RAID devs!)\n");
3580 free(config, M_RAIDFRAME);
3581 return(1);
3582 }
3583
3584 #ifdef DEBUG
3585 printf("Configuring raid%d:\n",raidID);
3586 #endif
3587
3588 raidPtr = raidPtrs[raidID];
3589
3590 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3591 raidPtr->raidid = raidID;
3592 raidPtr->openings = RAIDOUTSTANDING;
3593
3594 /* 3. Build the configuration structure */
3595 rf_create_configuration(cset->ac, config, raidPtr);
3596
3597 /* 4. Do the configuration */
3598 retcode = rf_Configure(raidPtr, config, cset->ac);
3599
3600 if (retcode == 0) {
3601
3602 raidinit(raidPtrs[raidID]);
3603
3604 rf_markalldirty(raidPtrs[raidID]);
3605 raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
3606 if (cset->ac->clabel->root_partition==1) {
3607 /* everything configured just fine. Make a note
3608 that this set is eligible to be root. */
3609 cset->rootable = 1;
3610 /* XXX do this here? */
3611 raidPtrs[raidID]->root_partition = 1;
3612 }
3613 }
3614
3615 /* 5. Cleanup */
3616 free(config, M_RAIDFRAME);
3617
3618 *unit = raidID;
3619 return(retcode);
3620 }
3621
3622 void
3623 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3624 {
3625 struct buf *bp;
3626
3627 bp = (struct buf *)desc->bp;
3628 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3629 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3630 }
3631
/*
 * Initialize one of RAIDframe's item pools.  'xmin' items are
 * pre-allocated (and kept as the low-water mark); at most 'xmax' idle
 * items are cached.  IPL_BIO because the pools are used from the
 * block-I/O path.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);	/* cache no more than xmax free items */
	pool_prime(p, xmin);	/* pre-allocate xmin items up front */
	pool_setlowat(p, xmin);	/* try to keep at least xmin around */
}
3641
3642 /*
3643 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3644 * if there is IO pending and if that IO could possibly be done for a
3645 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3646 * otherwise.
3647 *
3648 */
3649
3650 int
3651 rf_buf_queue_check(int raidid)
3652 {
3653 if ((BUFQ_PEEK(raid_softc[raidid].buf_queue) != NULL) &&
3654 raidPtrs[raidid]->openings > 0) {
3655 /* there is work to do */
3656 return 0;
3657 }
3658 /* default is nothing to do */
3659 return 1;
3660 }
3661
3662 int
3663 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
3664 {
3665 struct partinfo dpart;
3666 struct dkwedge_info dkw;
3667 int error;
3668
3669 error = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, l->l_cred);
3670 if (error == 0) {
3671 diskPtr->blockSize = dpart.disklab->d_secsize;
3672 diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
3673 diskPtr->partitionSize = dpart.part->p_size;
3674 return 0;
3675 }
3676
3677 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, l->l_cred);
3678 if (error == 0) {
3679 diskPtr->blockSize = 512; /* XXX */
3680 diskPtr->numBlocks = dkw.dkw_size - rf_protectedSectors;
3681 diskPtr->partitionSize = dkw.dkw_size;
3682 return 0;
3683 }
3684 return error;
3685 }
3686
/*
 * Autoconf match routine.  raid is a pseudo-device, so any attach
 * attempt matches unconditionally.
 */
static int
raid_match(struct device *self, struct cfdata *cfdata,
    void *aux)
{
	return 1;
}
3693
/*
 * Autoconf attach routine.  Intentionally empty: the real setup of a
 * raid unit happens at configuration time (see raidinit and the
 * autoconfig path), not at device attach.
 */
static void
raid_attach(struct device *parent, struct device *self,
    void *aux)
{

}
3700
3701
3702 static int
3703 raid_detach(struct device *self, int flags)
3704 {
3705 struct raid_softc *rs = (struct raid_softc *)self;
3706
3707 if (rs->sc_flags & RAIDF_INITED)
3708 return EBUSY;
3709
3710 return 0;
3711 }
3712
/*
 * Attach a synthesized "disk-info" geometry dictionary to the raid
 * device's properties so consumers of disk geometry have plausible
 * values to work with.  The geometry is fabricated: one "track" per
 * data stripe and 4 tracks per cylinder per column.
 */
static void
rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	prop_dictionary_t disk_info, odisk_info, geom;
	disk_info = prop_dictionary_create();
	geom = prop_dictionary_create();
	prop_dictionary_set_uint64(geom, "sectors-per-unit",
				   raidPtr->totalSectors);
	prop_dictionary_set_uint32(geom, "sector-size",
				   raidPtr->bytesPerSector);

	/* fake geometry: stripe == track, 4*numCol tracks/cylinder */
	prop_dictionary_set_uint16(geom, "sectors-per-track",
				   raidPtr->Layout.dataSectorsPerStripe);
	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
				   4 * raidPtr->numCol);

	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
	    raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
	    (4 * raidPtr->numCol)));

	prop_dictionary_set(disk_info, "geometry", geom);
	/* disk_info now holds its own reference to geom */
	prop_object_release(geom);
	prop_dictionary_set(device_properties(rs->sc_dev),
			    "disk-info", disk_info);
	odisk_info = rs->sc_dkdev.dk_info;
	rs->sc_dkdev.dk_info = disk_info;
	/* drop the reference to any previously-installed dictionary */
	if (odisk_info)
		prop_object_release(odisk_info);
}
3742
3743 /*
3744 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3745 * We end up returning whatever error was returned by the first cache flush
3746 * that fails.
3747 */
3748
3749 int
3750 rf_sync_component_caches(RF_Raid_t *raidPtr)
3751 {
3752 int c, sparecol;
3753 int e,error;
3754 int force = 1;
3755
3756 error = 0;
3757 for (c = 0; c < raidPtr->numCol; c++) {
3758 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3759 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3760 &force, FWRITE, NOCRED);
3761 if (e) {
3762 if (e != ENODEV)
3763 printf("raid%d: cache flush to component %s failed.\n",
3764 raidPtr->raidid, raidPtr->Disks[c].devname);
3765 if (error == 0) {
3766 error = e;
3767 }
3768 }
3769 }
3770 }
3771
3772 for( c = 0; c < raidPtr->numSpare ; c++) {
3773 sparecol = raidPtr->numCol + c;
3774 /* Need to ensure that the reconstruct actually completed! */
3775 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3776 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3777 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3778 if (e) {
3779 if (e != ENODEV)
3780 printf("raid%d: cache flush to component %s failed.\n",
3781 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3782 if (error == 0) {
3783 error = e;
3784 }
3785 }
3786 }
3787 }
3788 return error;
3789 }
3790