/*	$NetBSD: rf_netbsdkintf.c,v 1.250.4.6 2010/03/06 20:56:15 sborrill Exp $	*/
2 /*-
3 * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Greg Oster; Jason R. Thorpe.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * Copyright (c) 1990, 1993
33 * The Regents of the University of California. All rights reserved.
34 *
35 * This code is derived from software contributed to Berkeley by
36 * the Systems Programming Group of the University of Utah Computer
37 * Science Department.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 * may be used to endorse or promote products derived from this software
49 * without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 *
63 * from: Utah $Hdr: cd.c 1.6 90/11/28$
64 *
65 * @(#)cd.c 8.2 (Berkeley) 11/16/93
66 */
67
68 /*
69 * Copyright (c) 1988 University of Utah.
70 *
71 * This code is derived from software contributed to Berkeley by
72 * the Systems Programming Group of the University of Utah Computer
73 * Science Department.
74 *
75 * Redistribution and use in source and binary forms, with or without
76 * modification, are permitted provided that the following conditions
77 * are met:
78 * 1. Redistributions of source code must retain the above copyright
79 * notice, this list of conditions and the following disclaimer.
80 * 2. Redistributions in binary form must reproduce the above copyright
81 * notice, this list of conditions and the following disclaimer in the
82 * documentation and/or other materials provided with the distribution.
83 * 3. All advertising materials mentioning features or use of this software
84 * must display the following acknowledgement:
85 * This product includes software developed by the University of
86 * California, Berkeley and its contributors.
87 * 4. Neither the name of the University nor the names of its contributors
88 * may be used to endorse or promote products derived from this software
89 * without specific prior written permission.
90 *
91 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
92 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
93 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
94 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
95 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
96 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
97 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
98 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
99 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
100 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
101 * SUCH DAMAGE.
102 *
103 * from: Utah $Hdr: cd.c 1.6 90/11/28$
104 *
105 * @(#)cd.c 8.2 (Berkeley) 11/16/93
106 */
107
108 /*
109 * Copyright (c) 1995 Carnegie-Mellon University.
110 * All rights reserved.
111 *
112 * Authors: Mark Holland, Jim Zelenka
113 *
114 * Permission to use, copy, modify and distribute this software and
115 * its documentation is hereby granted, provided that both the copyright
116 * notice and this permission notice appear in all copies of the
117 * software, derivative works or modified versions, and any portions
118 * thereof, and that both notices appear in supporting documentation.
119 *
120 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
121 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
122 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
123 *
124 * Carnegie Mellon requests users of this software to return to
125 *
126 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
127 * School of Computer Science
128 * Carnegie Mellon University
129 * Pittsburgh PA 15213-3890
130 *
131 * any improvements or extensions that they make and grant Carnegie the
132 * rights to redistribute these changes.
133 */
134
135 /***********************************************************
136 *
137 * rf_kintf.c -- the kernel interface routines for RAIDframe
138 *
139 ***********************************************************/
140
141 #include <sys/cdefs.h>
142 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.250.4.6 2010/03/06 20:56:15 sborrill Exp $");
143
144 #include <sys/param.h>
145 #include <sys/errno.h>
146 #include <sys/pool.h>
147 #include <sys/proc.h>
148 #include <sys/queue.h>
149 #include <sys/disk.h>
150 #include <sys/device.h>
151 #include <sys/stat.h>
152 #include <sys/ioctl.h>
153 #include <sys/fcntl.h>
154 #include <sys/systm.h>
155 #include <sys/vnode.h>
156 #include <sys/disklabel.h>
157 #include <sys/conf.h>
158 #include <sys/buf.h>
159 #include <sys/bufq.h>
160 #include <sys/user.h>
161 #include <sys/reboot.h>
162 #include <sys/kauth.h>
163
164 #include <prop/proplib.h>
165
166 #include <dev/raidframe/raidframevar.h>
167 #include <dev/raidframe/raidframeio.h>
168 #include <dev/raidframe/rf_paritymap.h>
169 #include "raid.h"
170 #include "opt_raid_autoconfig.h"
171 #include "rf_raid.h"
172 #include "rf_copyback.h"
173 #include "rf_dag.h"
174 #include "rf_dagflags.h"
175 #include "rf_desc.h"
176 #include "rf_diskqueue.h"
177 #include "rf_etimer.h"
178 #include "rf_general.h"
179 #include "rf_kintf.h"
180 #include "rf_options.h"
181 #include "rf_driver.h"
182 #include "rf_parityscan.h"
183 #include "rf_threadstuff.h"
184
/*
 * db1_printf((fmt, ...)) -- level-1 debug printf.
 *
 * The single macro argument is a complete, parenthesized printf argument
 * list.  In DEBUG kernels the message is printed when rf_kdebug_level is
 * greater than zero; in non-DEBUG kernels the macro expands to nothing.
 *
 * Both variants are wrapped in do { } while (0) so that an invocation
 * followed by a semicolon is exactly one statement.  This keeps the macro
 * safe inside unbraced if/else bodies (no dangling else, no stray `;').
 */
#ifdef DEBUG
int     rf_kdebug_level = 0;
#define db1_printf(a) do { if (rf_kdebug_level > 0) printf a; } while (0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (0)
#endif				/* DEBUG */
191
192 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
193
194 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
195 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
196
197 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
198 * spare table */
199 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
200 * installation process */
201 #endif
202
203 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
204
205 /* prototypes */
206 static void KernelWakeupFunc(struct buf *);
207 static void InitBP(struct buf *, struct vnode *, unsigned,
208 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
209 void *, int, struct proc *);
210 static void raidinit(RF_Raid_t *);
211
212 void raidattach(int);
213 static int raid_match(struct device *, struct cfdata *, void *);
214 static void raid_attach(struct device *, struct device *, void *);
215 static int raid_detach(struct device *, int);
216
217 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
218 daddr_t, daddr_t);
219 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
220 daddr_t, daddr_t, int);
221
222 static int raidwrite_component_label(dev_t, struct vnode *,
223 RF_ComponentLabel_t *);
224 static int raidread_component_label(dev_t, struct vnode *,
225 RF_ComponentLabel_t *);
226
227
228 dev_type_open(raidopen);
229 dev_type_close(raidclose);
230 dev_type_read(raidread);
231 dev_type_write(raidwrite);
232 dev_type_ioctl(raidioctl);
233 dev_type_strategy(raidstrategy);
234 dev_type_dump(raiddump);
235 dev_type_size(raidsize);
236
237 const struct bdevsw raid_bdevsw = {
238 raidopen, raidclose, raidstrategy, raidioctl,
239 raiddump, raidsize, D_DISK
240 };
241
242 const struct cdevsw raid_cdevsw = {
243 raidopen, raidclose, raidread, raidwrite, raidioctl,
244 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
245 };
246
247 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
248
249 /* XXX Not sure if the following should be replacing the raidPtrs above,
250 or if it should be used in conjunction with that...
251 */
252
253 struct raid_softc {
254 struct device *sc_dev;
255 int sc_flags; /* flags */
256 int sc_cflags; /* configuration flags */
257 uint64_t sc_size; /* size of the raid device */
258 char sc_xname[20]; /* XXX external name */
259 struct disk sc_dkdev; /* generic disk device info */
260 struct bufq_state *buf_queue; /* used for the device queue */
261 };
262 /* sc_flags */
263 #define RAIDF_INITED 0x01 /* unit has been initialized */
264 #define RAIDF_WLABEL 0x02 /* label area is writable */
265 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
266 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
267 #define RAIDF_LOCKED 0x80 /* unit is locked */
268
269 #define raidunit(x) DISKUNIT(x)
270 int numraid = 0;
271
272 extern struct cfdriver raid_cd;
273 CFATTACH_DECL_NEW(raid, sizeof(struct raid_softc),
274 raid_match, raid_attach, raid_detach, NULL);
275
276 /*
277 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
278 * Be aware that large numbers can allow the driver to consume a lot of
279 * kernel memory, especially on writes, and in degraded mode reads.
280 *
281 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
282 * a single 64K write will typically require 64K for the old data,
283 * 64K for the old parity, and 64K for the new parity, for a total
284 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
286 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
287 *
288 * Now in degraded mode, for example, a 64K read on the above setup may
289 * require data reconstruction, which will require *all* of the 4 remaining
290 * disks to participate -- 4 * 32K/disk == 128K again.
291 */
292
/* Default limit on simultaneous I/Os; may be overridden at build time. */
#if !defined(RAIDOUTSTANDING)
#define	RAIDOUTSTANDING	6
#endif

/* The raw partition of the unit that `dev' belongs to. */
#define	RAIDLABELDEV(dev)						\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
299
300 /* declared here, and made public, for the benefit of KVM stuff.. */
301 struct raid_softc *raid_softc;
302
303 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
304 struct disklabel *);
305 static void raidgetdisklabel(dev_t);
306 static void raidmakedisklabel(struct raid_softc *);
307
308 static int raidlock(struct raid_softc *);
309 static void raidunlock(struct raid_softc *);
310
311 static void rf_markalldirty(RF_Raid_t *);
312 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
313
314 void rf_ReconThread(struct rf_recon_req *);
315 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
316 void rf_CopybackThread(RF_Raid_t *raidPtr);
317 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
318 int rf_autoconfig(struct device *self);
319 void rf_buildroothack(RF_ConfigSet_t *);
320
321 RF_AutoConfig_t *rf_find_raid_components(void);
322 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
323 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
324 static int rf_reasonable_label(RF_ComponentLabel_t *);
325 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
326 int rf_set_autoconfig(RF_Raid_t *, int);
327 int rf_set_rootpartition(RF_Raid_t *, int);
328 void rf_release_all_vps(RF_ConfigSet_t *);
329 void rf_cleanup_config_set(RF_ConfigSet_t *);
330 int rf_have_enough_components(RF_ConfigSet_t *);
331 int rf_auto_config_set(RF_ConfigSet_t *, int *);
332
333 static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not
334 allow autoconfig to take place.
335 Note that this is overridden by having
336 RAID_AUTOCONFIG as an option in the
337 kernel config file. */
338
339 struct RF_Pools_s rf_pools;
340
341 void
342 raidattach(int num)
343 {
344 int raidID;
345 int i, rc;
346
347 #ifdef DEBUG
348 printf("raidattach: Asked for %d units\n", num);
349 #endif
350
351 if (num <= 0) {
352 #ifdef DIAGNOSTIC
353 panic("raidattach: count <= 0");
354 #endif
355 return;
356 }
357 /* This is where all the initialization stuff gets done. */
358
359 numraid = num;
360
361 /* Make some space for requested number of units... */
362
363 RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
364 if (raidPtrs == NULL) {
365 panic("raidPtrs is NULL!!");
366 }
367
368 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
369 rf_mutex_init(&rf_sparet_wait_mutex);
370
371 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
372 #endif
373
374 for (i = 0; i < num; i++)
375 raidPtrs[i] = NULL;
376 rc = rf_BootRaidframe();
377 if (rc == 0)
378 aprint_normal("Kernelized RAIDframe activated\n");
379 else
380 panic("Serious error booting RAID!!");
381
382 /* put together some datastructures like the CCD device does.. This
383 * lets us lock the device and what-not when it gets opened. */
384
385 raid_softc = (struct raid_softc *)
386 malloc(num * sizeof(struct raid_softc),
387 M_RAIDFRAME, M_NOWAIT);
388 if (raid_softc == NULL) {
389 aprint_error("WARNING: no memory for RAIDframe driver\n");
390 return;
391 }
392
393 memset(raid_softc, 0, num * sizeof(struct raid_softc));
394
395 for (raidID = 0; raidID < num; raidID++) {
396 bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);
397
398 RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
399 (RF_Raid_t *));
400 if (raidPtrs[raidID] == NULL) {
401 aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
402 numraid = raidID;
403 return;
404 }
405 }
406
407 if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
408 aprint_error("raidattach: config_cfattach_attach failed?\n");
409 }
410
411 #ifdef RAID_AUTOCONFIG
412 raidautoconfig = 1;
413 #endif
414
415 /*
416 * Register a finalizer which will be used to auto-config RAID
417 * sets once all real hardware devices have been found.
418 */
419 if (config_finalize_register(NULL, rf_autoconfig) != 0)
420 aprint_error("WARNING: unable to register RAIDframe finalizer\n");
421 }
422
423 int
424 rf_autoconfig(struct device *self)
425 {
426 RF_AutoConfig_t *ac_list;
427 RF_ConfigSet_t *config_sets;
428
429 if (raidautoconfig == 0)
430 return (0);
431
432 /* XXX This code can only be run once. */
433 raidautoconfig = 0;
434
435 /* 1. locate all RAID components on the system */
436 #ifdef DEBUG
437 printf("Searching for RAID components...\n");
438 #endif
439 ac_list = rf_find_raid_components();
440
441 /* 2. Sort them into their respective sets. */
442 config_sets = rf_create_auto_sets(ac_list);
443
444 /*
445 * 3. Evaluate each set andconfigure the valid ones.
446 * This gets done in rf_buildroothack().
447 */
448 rf_buildroothack(config_sets);
449
450 return 1;
451 }
452
453 void
454 rf_buildroothack(RF_ConfigSet_t *config_sets)
455 {
456 RF_ConfigSet_t *cset;
457 RF_ConfigSet_t *next_cset;
458 int retcode;
459 int raidID;
460 int rootID;
461 int col;
462 int num_root;
463 char *devname;
464
465 rootID = 0;
466 num_root = 0;
467 cset = config_sets;
468 while(cset != NULL ) {
469 next_cset = cset->next;
470 if (rf_have_enough_components(cset) &&
471 cset->ac->clabel->autoconfigure==1) {
472 retcode = rf_auto_config_set(cset,&raidID);
473 if (!retcode) {
474 #ifdef DEBUG
475 printf("raid%d: configured ok\n", raidID);
476 #endif
477 if (cset->rootable) {
478 rootID = raidID;
479 num_root++;
480 }
481 } else {
482 /* The autoconfig didn't work :( */
483 #ifdef DEBUG
484 printf("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
485 #endif
486 rf_release_all_vps(cset);
487 }
488 } else {
489 /* we're not autoconfiguring this set...
490 release the associated resources */
491 rf_release_all_vps(cset);
492 }
493 /* cleanup */
494 rf_cleanup_config_set(cset);
495 cset = next_cset;
496 }
497
498 /* if the user has specified what the root device should be
499 then we don't touch booted_device or boothowto... */
500
501 if (rootspec != NULL)
502 return;
503
504 /* we found something bootable... */
505
506 if (num_root == 1) {
507 booted_device = raid_softc[rootID].sc_dev;
508 } else if (num_root > 1) {
509
510 /*
511 * Maybe the MD code can help. If it cannot, then
512 * setroot() will discover that we have no
513 * booted_device and will ask the user if nothing was
514 * hardwired in the kernel config file
515 */
516
517 if (booted_device == NULL)
518 cpu_rootconf();
519 if (booted_device == NULL)
520 return;
521
522 num_root = 0;
523 for (raidID = 0; raidID < numraid; raidID++) {
524 if (raidPtrs[raidID]->valid == 0)
525 continue;
526
527 if (raidPtrs[raidID]->root_partition == 0)
528 continue;
529
530 for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
531 devname = raidPtrs[raidID]->Disks[col].devname;
532 devname += sizeof("/dev/") - 1;
533 if (strncmp(devname, device_xname(booted_device),
534 strlen(device_xname(booted_device))) != 0)
535 continue;
536 #ifdef DEBUG
537 printf("raid%d includes boot device %s\n",
538 raidID, devname);
539 #endif
540 num_root++;
541 rootID = raidID;
542 }
543 }
544
545 if (num_root == 1) {
546 booted_device = raid_softc[rootID].sc_dev;
547 } else {
548 /* we can't guess.. require the user to answer... */
549 boothowto |= RB_ASKNAME;
550 }
551 }
552 }
553
554
555 int
556 raidsize(dev_t dev)
557 {
558 struct raid_softc *rs;
559 struct disklabel *lp;
560 int part, unit, omask, size;
561
562 unit = raidunit(dev);
563 if (unit >= numraid)
564 return (-1);
565 rs = &raid_softc[unit];
566
567 if ((rs->sc_flags & RAIDF_INITED) == 0)
568 return (-1);
569
570 part = DISKPART(dev);
571 omask = rs->sc_dkdev.dk_openmask & (1 << part);
572 lp = rs->sc_dkdev.dk_label;
573
574 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
575 return (-1);
576
577 if (lp->d_partitions[part].p_fstype != FS_SWAP)
578 size = -1;
579 else
580 size = lp->d_partitions[part].p_size *
581 (lp->d_secsize / DEV_BSIZE);
582
583 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
584 return (-1);
585
586 return (size);
587
588 }
589
590 int
591 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
592 {
593 int unit = raidunit(dev);
594 struct raid_softc *rs;
595 const struct bdevsw *bdev;
596 struct disklabel *lp;
597 RF_Raid_t *raidPtr;
598 daddr_t offset;
599 int part, c, sparecol, j, scol, dumpto;
600 int error = 0;
601
602 if (unit >= numraid)
603 return (ENXIO);
604
605 rs = &raid_softc[unit];
606 raidPtr = raidPtrs[unit];
607
608 if ((rs->sc_flags & RAIDF_INITED) == 0)
609 return ENXIO;
610
611 /* we only support dumping to RAID 1 sets */
612 if (raidPtr->Layout.numDataCol != 1 ||
613 raidPtr->Layout.numParityCol != 1)
614 return EINVAL;
615
616
617 if ((error = raidlock(rs)) != 0)
618 return error;
619
620 if (size % DEV_BSIZE != 0) {
621 error = EINVAL;
622 goto out;
623 }
624
625 if (blkno + size / DEV_BSIZE > rs->sc_size) {
626 printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
627 "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
628 size / DEV_BSIZE, rs->sc_size);
629 error = EINVAL;
630 goto out;
631 }
632
633 part = DISKPART(dev);
634 lp = rs->sc_dkdev.dk_label;
635 offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
636
637 /* figure out what device is alive.. */
638
639 /*
640 Look for a component to dump to. The preference for the
641 component to dump to is as follows:
642 1) the master
643 2) a used_spare of the master
644 3) the slave
645 4) a used_spare of the slave
646 */
647
648 dumpto = -1;
649 for (c = 0; c < raidPtr->numCol; c++) {
650 if (raidPtr->Disks[c].status == rf_ds_optimal) {
651 /* this might be the one */
652 dumpto = c;
653 break;
654 }
655 }
656
657 /*
658 At this point we have possibly selected a live master or a
659 live slave. We now check to see if there is a spared
660 master (or a spared slave), if we didn't find a live master
661 or a live slave.
662 */
663
664 for (c = 0; c < raidPtr->numSpare; c++) {
665 sparecol = raidPtr->numCol + c;
666 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
667 /* How about this one? */
668 scol = -1;
669 for(j=0;j<raidPtr->numCol;j++) {
670 if (raidPtr->Disks[j].spareCol == sparecol) {
671 scol = j;
672 break;
673 }
674 }
675 if (scol == 0) {
676 /*
677 We must have found a spared master!
678 We'll take that over anything else
679 found so far. (We couldn't have
680 found a real master before, since
681 this is a used spare, and it's
682 saying that it's replacing the
683 master.) On reboot (with
684 autoconfiguration turned on)
685 sparecol will become the 1st
686 component (component0) of this set.
687 */
688 dumpto = sparecol;
689 break;
690 } else if (scol != -1) {
691 /*
692 Must be a spared slave. We'll dump
693 to that if we havn't found anything
694 else so far.
695 */
696 if (dumpto == -1)
697 dumpto = sparecol;
698 }
699 }
700 }
701
702 if (dumpto == -1) {
703 /* we couldn't find any live components to dump to!?!?
704 */
705 error = EINVAL;
706 goto out;
707 }
708
709 bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
710
711 /*
712 Note that blkno is relative to this particular partition.
713 By adding the offset of this partition in the RAID
714 set, and also adding RF_PROTECTED_SECTORS, we get a
715 value that is relative to the partition used for the
716 underlying component.
717 */
718
719 error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
720 blkno + offset, va, size);
721
722 out:
723 raidunlock(rs);
724
725 return error;
726 }
727 /* ARGSUSED */
728 int
729 raidopen(dev_t dev, int flags, int fmt,
730 struct lwp *l)
731 {
732 int unit = raidunit(dev);
733 struct raid_softc *rs;
734 struct disklabel *lp;
735 int part, pmask;
736 int error = 0;
737
738 if (unit >= numraid)
739 return (ENXIO);
740 rs = &raid_softc[unit];
741
742 if ((error = raidlock(rs)) != 0)
743 return (error);
744 lp = rs->sc_dkdev.dk_label;
745
746 part = DISKPART(dev);
747
748 /*
749 * If there are wedges, and this is not RAW_PART, then we
750 * need to fail.
751 */
752 if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
753 error = EBUSY;
754 goto bad;
755 }
756 pmask = (1 << part);
757
758 if ((rs->sc_flags & RAIDF_INITED) &&
759 (rs->sc_dkdev.dk_openmask == 0))
760 raidgetdisklabel(dev);
761
762 /* make sure that this partition exists */
763
764 if (part != RAW_PART) {
765 if (((rs->sc_flags & RAIDF_INITED) == 0) ||
766 ((part >= lp->d_npartitions) ||
767 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
768 error = ENXIO;
769 goto bad;
770 }
771 }
772 /* Prevent this unit from being unconfigured while open. */
773 switch (fmt) {
774 case S_IFCHR:
775 rs->sc_dkdev.dk_copenmask |= pmask;
776 break;
777
778 case S_IFBLK:
779 rs->sc_dkdev.dk_bopenmask |= pmask;
780 break;
781 }
782
783 if ((rs->sc_dkdev.dk_openmask == 0) &&
784 ((rs->sc_flags & RAIDF_INITED) != 0)) {
785 /* First one... mark things as dirty... Note that we *MUST*
786 have done a configure before this. I DO NOT WANT TO BE
787 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
788 THAT THEY BELONG TOGETHER!!!!! */
789 /* XXX should check to see if we're only open for reading
790 here... If so, we needn't do this, but then need some
791 other way of keeping track of what's happened.. */
792
793 rf_markalldirty( raidPtrs[unit] );
794 }
795
796
797 rs->sc_dkdev.dk_openmask =
798 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
799
800 bad:
801 raidunlock(rs);
802
803 return (error);
804
805
806 }
807 /* ARGSUSED */
808 int
809 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
810 {
811 int unit = raidunit(dev);
812 struct cfdata *cf;
813 struct raid_softc *rs;
814 int error = 0;
815 int part;
816
817 if (unit >= numraid)
818 return (ENXIO);
819 rs = &raid_softc[unit];
820
821 if ((error = raidlock(rs)) != 0)
822 return (error);
823
824 part = DISKPART(dev);
825
826 /* ...that much closer to allowing unconfiguration... */
827 switch (fmt) {
828 case S_IFCHR:
829 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
830 break;
831
832 case S_IFBLK:
833 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
834 break;
835 }
836 rs->sc_dkdev.dk_openmask =
837 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
838
839 if ((rs->sc_dkdev.dk_openmask == 0) &&
840 ((rs->sc_flags & RAIDF_INITED) != 0)) {
841 /* Last one... device is not unconfigured yet.
842 Device shutdown has taken care of setting the
843 clean bits if RAIDF_INITED is not set
844 mark things as clean... */
845
846 rf_update_component_labels(raidPtrs[unit],
847 RF_FINAL_COMPONENT_UPDATE);
848 if (doing_shutdown) {
849 /* last one, and we're going down, so
850 lights out for this RAID set too. */
851 error = rf_Shutdown(raidPtrs[unit]);
852
853 /* It's no longer initialized... */
854 rs->sc_flags &= ~RAIDF_INITED;
855
856 /* detach the device */
857
858 cf = device_cfdata(rs->sc_dev);
859 error = config_detach(rs->sc_dev, DETACH_QUIET);
860 free(cf, M_RAIDFRAME);
861
862 /* Detach the disk. */
863 disk_detach(&rs->sc_dkdev);
864 disk_destroy(&rs->sc_dkdev);
865 }
866 }
867
868 raidunlock(rs);
869 return (0);
870
871 }
872
873 void
874 raidstrategy(struct buf *bp)
875 {
876 int s;
877
878 unsigned int raidID = raidunit(bp->b_dev);
879 RF_Raid_t *raidPtr;
880 struct raid_softc *rs = &raid_softc[raidID];
881 int wlabel;
882
883 if ((rs->sc_flags & RAIDF_INITED) ==0) {
884 bp->b_error = ENXIO;
885 goto done;
886 }
887 if (raidID >= numraid || !raidPtrs[raidID]) {
888 bp->b_error = ENODEV;
889 goto done;
890 }
891 raidPtr = raidPtrs[raidID];
892 if (!raidPtr->valid) {
893 bp->b_error = ENODEV;
894 goto done;
895 }
896 if (bp->b_bcount == 0) {
897 db1_printf(("b_bcount is zero..\n"));
898 goto done;
899 }
900
901 /*
902 * Do bounds checking and adjust transfer. If there's an
903 * error, the bounds check will flag that for us.
904 */
905
906 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
907 if (DISKPART(bp->b_dev) == RAW_PART) {
908 uint64_t size; /* device size in DEV_BSIZE unit */
909
910 if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
911 size = raidPtr->totalSectors <<
912 (raidPtr->logBytesPerSector - DEV_BSHIFT);
913 } else {
914 size = raidPtr->totalSectors >>
915 (DEV_BSHIFT - raidPtr->logBytesPerSector);
916 }
917 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
918 goto done;
919 }
920 } else {
921 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
922 db1_printf(("Bounds check failed!!:%d %d\n",
923 (int) bp->b_blkno, (int) wlabel));
924 goto done;
925 }
926 }
927 s = splbio();
928
929 bp->b_resid = 0;
930
931 /* stuff it onto our queue */
932 BUFQ_PUT(rs->buf_queue, bp);
933
934 /* scheduled the IO to happen at the next convenient time */
935 wakeup(&(raidPtrs[raidID]->iodone));
936
937 splx(s);
938 return;
939
940 done:
941 bp->b_resid = bp->b_bcount;
942 biodone(bp);
943 }
944 /* ARGSUSED */
945 int
946 raidread(dev_t dev, struct uio *uio, int flags)
947 {
948 int unit = raidunit(dev);
949 struct raid_softc *rs;
950
951 if (unit >= numraid)
952 return (ENXIO);
953 rs = &raid_softc[unit];
954
955 if ((rs->sc_flags & RAIDF_INITED) == 0)
956 return (ENXIO);
957
958 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
959
960 }
961 /* ARGSUSED */
962 int
963 raidwrite(dev_t dev, struct uio *uio, int flags)
964 {
965 int unit = raidunit(dev);
966 struct raid_softc *rs;
967
968 if (unit >= numraid)
969 return (ENXIO);
970 rs = &raid_softc[unit];
971
972 if ((rs->sc_flags & RAIDF_INITED) == 0)
973 return (ENXIO);
974
975 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
976
977 }
978
979 int
980 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
981 {
982 int unit = raidunit(dev);
983 int error = 0;
984 int part, pmask;
985 struct cfdata *cf;
986 struct raid_softc *rs;
987 RF_Config_t *k_cfg, *u_cfg;
988 RF_Raid_t *raidPtr;
989 RF_RaidDisk_t *diskPtr;
990 RF_AccTotals_t *totals;
991 RF_DeviceConfig_t *d_cfg, **ucfgp;
992 u_char *specific_buf;
993 int retcode = 0;
994 int column;
995 /* int raidid; */
996 struct rf_recon_req *rrcopy, *rr;
997 RF_ComponentLabel_t *clabel;
998 RF_ComponentLabel_t *ci_label;
999 RF_ComponentLabel_t **clabel_ptr;
1000 RF_SingleComponent_t *sparePtr,*componentPtr;
1001 RF_SingleComponent_t component;
1002 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1003 int i, j, d;
1004 #ifdef __HAVE_OLD_DISKLABEL
1005 struct disklabel newlabel;
1006 #endif
1007 struct dkwedge_info *dkw;
1008
1009 if (unit >= numraid)
1010 return (ENXIO);
1011 rs = &raid_softc[unit];
1012 raidPtr = raidPtrs[unit];
1013
1014 db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
1015 (int) DISKPART(dev), (int) unit, (int) cmd));
1016
1017 /* Must be open for writes for these commands... */
1018 switch (cmd) {
1019 #ifdef DIOCGSECTORSIZE
1020 case DIOCGSECTORSIZE:
1021 *(u_int *)data = raidPtr->bytesPerSector;
1022 return 0;
1023 case DIOCGMEDIASIZE:
1024 *(off_t *)data =
1025 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1026 return 0;
1027 #endif
1028 case DIOCSDINFO:
1029 case DIOCWDINFO:
1030 #ifdef __HAVE_OLD_DISKLABEL
1031 case ODIOCWDINFO:
1032 case ODIOCSDINFO:
1033 #endif
1034 case DIOCWLABEL:
1035 case DIOCAWEDGE:
1036 case DIOCDWEDGE:
1037 if ((flag & FWRITE) == 0)
1038 return (EBADF);
1039 }
1040
1041 /* Must be initialized for these... */
1042 switch (cmd) {
1043 case DIOCGDINFO:
1044 case DIOCSDINFO:
1045 case DIOCWDINFO:
1046 #ifdef __HAVE_OLD_DISKLABEL
1047 case ODIOCGDINFO:
1048 case ODIOCWDINFO:
1049 case ODIOCSDINFO:
1050 case ODIOCGDEFLABEL:
1051 #endif
1052 case DIOCGPART:
1053 case DIOCWLABEL:
1054 case DIOCGDEFLABEL:
1055 case DIOCAWEDGE:
1056 case DIOCDWEDGE:
1057 case DIOCLWEDGES:
1058 case DIOCCACHESYNC:
1059 case RAIDFRAME_SHUTDOWN:
1060 case RAIDFRAME_REWRITEPARITY:
1061 case RAIDFRAME_GET_INFO:
1062 case RAIDFRAME_RESET_ACCTOTALS:
1063 case RAIDFRAME_GET_ACCTOTALS:
1064 case RAIDFRAME_KEEP_ACCTOTALS:
1065 case RAIDFRAME_GET_SIZE:
1066 case RAIDFRAME_FAIL_DISK:
1067 case RAIDFRAME_COPYBACK:
1068 case RAIDFRAME_CHECK_RECON_STATUS:
1069 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1070 case RAIDFRAME_GET_COMPONENT_LABEL:
1071 case RAIDFRAME_SET_COMPONENT_LABEL:
1072 case RAIDFRAME_ADD_HOT_SPARE:
1073 case RAIDFRAME_REMOVE_HOT_SPARE:
1074 case RAIDFRAME_INIT_LABELS:
1075 case RAIDFRAME_REBUILD_IN_PLACE:
1076 case RAIDFRAME_CHECK_PARITY:
1077 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1078 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1079 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1080 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1081 case RAIDFRAME_SET_AUTOCONFIG:
1082 case RAIDFRAME_SET_ROOT:
1083 case RAIDFRAME_DELETE_COMPONENT:
1084 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1085 case RAIDFRAME_PARITYMAP_STATUS:
1086 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1087 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1088 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1089 if ((rs->sc_flags & RAIDF_INITED) == 0)
1090 return (ENXIO);
1091 }
1092
1093 switch (cmd) {
1094
1095 /* configure the system */
1096 case RAIDFRAME_CONFIGURE:
1097
1098 if (raidPtr->valid) {
1099 /* There is a valid RAID set running on this unit! */
1100 printf("raid%d: Device already configured!\n",unit);
1101 return(EINVAL);
1102 }
1103
1104 /* copy-in the configuration information */
1105 /* data points to a pointer to the configuration structure */
1106
1107 u_cfg = *((RF_Config_t **) data);
1108 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1109 if (k_cfg == NULL) {
1110 return (ENOMEM);
1111 }
1112 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1113 if (retcode) {
1114 RF_Free(k_cfg, sizeof(RF_Config_t));
1115 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1116 retcode));
1117 return (retcode);
1118 }
1119 /* allocate a buffer for the layout-specific data, and copy it
1120 * in */
1121 if (k_cfg->layoutSpecificSize) {
1122 if (k_cfg->layoutSpecificSize > 10000) {
1123 /* sanity check */
1124 RF_Free(k_cfg, sizeof(RF_Config_t));
1125 return (EINVAL);
1126 }
1127 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1128 (u_char *));
1129 if (specific_buf == NULL) {
1130 RF_Free(k_cfg, sizeof(RF_Config_t));
1131 return (ENOMEM);
1132 }
1133 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1134 k_cfg->layoutSpecificSize);
1135 if (retcode) {
1136 RF_Free(k_cfg, sizeof(RF_Config_t));
1137 RF_Free(specific_buf,
1138 k_cfg->layoutSpecificSize);
1139 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1140 retcode));
1141 return (retcode);
1142 }
1143 } else
1144 specific_buf = NULL;
1145 k_cfg->layoutSpecific = specific_buf;
1146
1147 /* should do some kind of sanity check on the configuration.
1148 * Store the sum of all the bytes in the last byte? */
1149
1150 /* configure the system */
1151
1152 /*
1153 * Clear the entire RAID descriptor, just to make sure
1154 * there is no stale data left in the case of a
1155 * reconfiguration
1156 */
1157 memset((char *) raidPtr, 0, sizeof(RF_Raid_t));
1158 raidPtr->raidid = unit;
1159
1160 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1161
1162 if (retcode == 0) {
1163
1164 /* allow this many simultaneous IO's to
1165 this RAID device */
1166 raidPtr->openings = RAIDOUTSTANDING;
1167
1168 raidinit(raidPtr);
1169 rf_markalldirty(raidPtr);
1170 }
1171 /* free the buffers. No return code here. */
1172 if (k_cfg->layoutSpecificSize) {
1173 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1174 }
1175 RF_Free(k_cfg, sizeof(RF_Config_t));
1176
1177 return (retcode);
1178
1179 /* shutdown the system */
1180 case RAIDFRAME_SHUTDOWN:
1181
1182 if ((error = raidlock(rs)) != 0)
1183 return (error);
1184
1185 /*
1186 * If somebody has a partition mounted, we shouldn't
1187 * shutdown.
1188 */
1189
1190 part = DISKPART(dev);
1191 pmask = (1 << part);
1192 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1193 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1194 (rs->sc_dkdev.dk_copenmask & pmask))) {
1195 raidunlock(rs);
1196 return (EBUSY);
1197 }
1198
1199 retcode = rf_Shutdown(raidPtr);
1200
1201 /* It's no longer initialized... */
1202 rs->sc_flags &= ~RAIDF_INITED;
1203
1204 /* free the pseudo device attach bits */
1205
1206 cf = device_cfdata(rs->sc_dev);
1207 /* XXX this causes us to not return any errors
1208 from the above call to rf_Shutdown() */
1209 retcode = config_detach(rs->sc_dev, DETACH_QUIET);
1210 free(cf, M_RAIDFRAME);
1211
1212 /* Detach the disk. */
1213 disk_detach(&rs->sc_dkdev);
1214 disk_destroy(&rs->sc_dkdev);
1215
1216 raidunlock(rs);
1217
1218 return (retcode);
1219 case RAIDFRAME_GET_COMPONENT_LABEL:
1220 clabel_ptr = (RF_ComponentLabel_t **) data;
1221 /* need to read the component label for the disk indicated
1222 by row,column in clabel */
1223
1224 /*
1225 * Perhaps there should be an option to skip the in-core
1226 * copy and hit the disk, as with disklabel(8).
1227 */
1228 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1229
1230 retcode = copyin( *clabel_ptr, clabel,
1231 sizeof(RF_ComponentLabel_t));
1232
1233 if (retcode) {
1234 return(retcode);
1235 }
1236
1237 clabel->row = 0; /* Don't allow looking at anything else.*/
1238
1239 column = clabel->column;
1240
1241 if ((column < 0) || (column >= raidPtr->numCol +
1242 raidPtr->numSpare)) {
1243 return(EINVAL);
1244 }
1245
1246 RF_Free(clabel, sizeof(*clabel));
1247
1248 clabel = raidget_component_label(raidPtr, column);
1249
1250 if (retcode == 0) {
1251 retcode = copyout(clabel, *clabel_ptr,
1252 sizeof(RF_ComponentLabel_t));
1253 }
1254 return (retcode);
1255
1256 #if 0
1257 case RAIDFRAME_SET_COMPONENT_LABEL:
1258 clabel = (RF_ComponentLabel_t *) data;
1259
1260 /* XXX check the label for valid stuff... */
1261 /* Note that some things *should not* get modified --
1262 the user should be re-initing the labels instead of
1263 trying to patch things.
1264 */
1265
1266 raidid = raidPtr->raidid;
1267 #ifdef DEBUG
1268 printf("raid%d: Got component label:\n", raidid);
1269 printf("raid%d: Version: %d\n", raidid, clabel->version);
1270 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1271 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1272 printf("raid%d: Column: %d\n", raidid, clabel->column);
1273 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1274 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1275 printf("raid%d: Status: %d\n", raidid, clabel->status);
1276 #endif
1277 clabel->row = 0;
1278 column = clabel->column;
1279
1280 if ((column < 0) || (column >= raidPtr->numCol)) {
1281 return(EINVAL);
1282 }
1283
1284 /* XXX this isn't allowed to do anything for now :-) */
1285
1286 /* XXX and before it is, we need to fill in the rest
1287 of the fields!?!?!?! */
1288 memcpy(raidget_component_label(raidPtr, column),
1289 clabel, sizeof(*clabel));
1290 raidflush_component_label(raidPtr, column);
1291 return (0);
1292 #endif
1293
1294 case RAIDFRAME_INIT_LABELS:
1295 clabel = (RF_ComponentLabel_t *) data;
1296 /*
1297 we only want the serial number from
1298 the above. We get all the rest of the information
1299 from the config that was used to create this RAID
1300 set.
1301 */
1302
1303 raidPtr->serial_number = clabel->serial_number;
1304
1305 for(column=0;column<raidPtr->numCol;column++) {
1306 diskPtr = &raidPtr->Disks[column];
1307 if (!RF_DEAD_DISK(diskPtr->status)) {
1308 ci_label = raidget_component_label(raidPtr,
1309 column);
1310 /* Zeroing this is important. */
1311 memset(ci_label, 0, sizeof(*ci_label));
1312 raid_init_component_label(raidPtr, ci_label);
1313 ci_label->serial_number =
1314 raidPtr->serial_number;
1315 ci_label->row = 0; /* we dont' pretend to support more */
1316 ci_label->partitionSize =
1317 diskPtr->partitionSize;
1318 ci_label->column = column;
1319 raidflush_component_label(raidPtr, column);
1320 }
1321 /* XXXjld what about the spares? */
1322 }
1323
1324 return (retcode);
1325 case RAIDFRAME_SET_AUTOCONFIG:
1326 d = rf_set_autoconfig(raidPtr, *(int *) data);
1327 printf("raid%d: New autoconfig value is: %d\n",
1328 raidPtr->raidid, d);
1329 *(int *) data = d;
1330 return (retcode);
1331
1332 case RAIDFRAME_SET_ROOT:
1333 d = rf_set_rootpartition(raidPtr, *(int *) data);
1334 printf("raid%d: New rootpartition value is: %d\n",
1335 raidPtr->raidid, d);
1336 *(int *) data = d;
1337 return (retcode);
1338
1339 /* initialize all parity */
1340 case RAIDFRAME_REWRITEPARITY:
1341
1342 if (raidPtr->Layout.map->faultsTolerated == 0) {
1343 /* Parity for RAID 0 is trivially correct */
1344 raidPtr->parity_good = RF_RAID_CLEAN;
1345 return(0);
1346 }
1347
1348 if (raidPtr->parity_rewrite_in_progress == 1) {
1349 /* Re-write is already in progress! */
1350 return(EINVAL);
1351 }
1352
1353 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1354 rf_RewriteParityThread,
1355 raidPtr,"raid_parity");
1356 return (retcode);
1357
1358
1359 case RAIDFRAME_ADD_HOT_SPARE:
1360 sparePtr = (RF_SingleComponent_t *) data;
1361 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1362 retcode = rf_add_hot_spare(raidPtr, &component);
1363 return(retcode);
1364
1365 case RAIDFRAME_REMOVE_HOT_SPARE:
1366 return(retcode);
1367
1368 case RAIDFRAME_DELETE_COMPONENT:
1369 componentPtr = (RF_SingleComponent_t *)data;
1370 memcpy( &component, componentPtr,
1371 sizeof(RF_SingleComponent_t));
1372 retcode = rf_delete_component(raidPtr, &component);
1373 return(retcode);
1374
1375 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1376 componentPtr = (RF_SingleComponent_t *)data;
1377 memcpy( &component, componentPtr,
1378 sizeof(RF_SingleComponent_t));
1379 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1380 return(retcode);
1381
1382 case RAIDFRAME_REBUILD_IN_PLACE:
1383
1384 if (raidPtr->Layout.map->faultsTolerated == 0) {
1385 /* Can't do this on a RAID 0!! */
1386 return(EINVAL);
1387 }
1388
1389 if (raidPtr->recon_in_progress == 1) {
1390 /* a reconstruct is already in progress! */
1391 return(EINVAL);
1392 }
1393
1394 componentPtr = (RF_SingleComponent_t *) data;
1395 memcpy( &component, componentPtr,
1396 sizeof(RF_SingleComponent_t));
1397 component.row = 0; /* we don't support any more */
1398 column = component.column;
1399
1400 if ((column < 0) || (column >= raidPtr->numCol)) {
1401 return(EINVAL);
1402 }
1403
1404 RF_LOCK_MUTEX(raidPtr->mutex);
1405 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1406 (raidPtr->numFailures > 0)) {
1407 /* XXX 0 above shouldn't be constant!!! */
1408 /* some component other than this has failed.
1409 Let's not make things worse than they already
1410 are... */
1411 printf("raid%d: Unable to reconstruct to disk at:\n",
1412 raidPtr->raidid);
1413 printf("raid%d: Col: %d Too many failures.\n",
1414 raidPtr->raidid, column);
1415 RF_UNLOCK_MUTEX(raidPtr->mutex);
1416 return (EINVAL);
1417 }
1418 if (raidPtr->Disks[column].status ==
1419 rf_ds_reconstructing) {
1420 printf("raid%d: Unable to reconstruct to disk at:\n",
1421 raidPtr->raidid);
1422 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column);
1423
1424 RF_UNLOCK_MUTEX(raidPtr->mutex);
1425 return (EINVAL);
1426 }
1427 if (raidPtr->Disks[column].status == rf_ds_spared) {
1428 RF_UNLOCK_MUTEX(raidPtr->mutex);
1429 return (EINVAL);
1430 }
1431 RF_UNLOCK_MUTEX(raidPtr->mutex);
1432
1433 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1434 if (rrcopy == NULL)
1435 return(ENOMEM);
1436
1437 rrcopy->raidPtr = (void *) raidPtr;
1438 rrcopy->col = column;
1439
1440 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1441 rf_ReconstructInPlaceThread,
1442 rrcopy,"raid_reconip");
1443 return(retcode);
1444
1445 case RAIDFRAME_GET_INFO:
1446 if (!raidPtr->valid)
1447 return (ENODEV);
1448 ucfgp = (RF_DeviceConfig_t **) data;
1449 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1450 (RF_DeviceConfig_t *));
1451 if (d_cfg == NULL)
1452 return (ENOMEM);
1453 d_cfg->rows = 1; /* there is only 1 row now */
1454 d_cfg->cols = raidPtr->numCol;
1455 d_cfg->ndevs = raidPtr->numCol;
1456 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1457 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1458 return (ENOMEM);
1459 }
1460 d_cfg->nspares = raidPtr->numSpare;
1461 if (d_cfg->nspares >= RF_MAX_DISKS) {
1462 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1463 return (ENOMEM);
1464 }
1465 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1466 d = 0;
1467 for (j = 0; j < d_cfg->cols; j++) {
1468 d_cfg->devs[d] = raidPtr->Disks[j];
1469 d++;
1470 }
1471 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1472 d_cfg->spares[i] = raidPtr->Disks[j];
1473 }
1474 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1475 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1476
1477 return (retcode);
1478
1479 case RAIDFRAME_CHECK_PARITY:
1480 *(int *) data = raidPtr->parity_good;
1481 return (0);
1482
1483 case RAIDFRAME_PARITYMAP_STATUS:
1484 rf_paritymap_status(raidPtr->parity_map,
1485 (struct rf_pmstat *)data);
1486 return 0;
1487
1488 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1489 if (raidPtr->parity_map == NULL)
1490 return ENOENT; /* ??? */
1491 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1492 (struct rf_pmparams *)data, 1))
1493 return EINVAL;
1494 return 0;
1495
1496 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1497 *(int *) data = rf_paritymap_get_disable(raidPtr);
1498 return 0;
1499
1500 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1501 rf_paritymap_set_disable(raidPtr, *(int *)data);
1502 /* XXX should errors be passed up? */
1503 return 0;
1504
1505 case RAIDFRAME_RESET_ACCTOTALS:
1506 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1507 return (0);
1508
1509 case RAIDFRAME_GET_ACCTOTALS:
1510 totals = (RF_AccTotals_t *) data;
1511 *totals = raidPtr->acc_totals;
1512 return (0);
1513
1514 case RAIDFRAME_KEEP_ACCTOTALS:
1515 raidPtr->keep_acc_totals = *(int *)data;
1516 return (0);
1517
1518 case RAIDFRAME_GET_SIZE:
1519 *(int *) data = raidPtr->totalSectors;
1520 return (0);
1521
1522 /* fail a disk & optionally start reconstruction */
1523 case RAIDFRAME_FAIL_DISK:
1524
1525 if (raidPtr->Layout.map->faultsTolerated == 0) {
1526 /* Can't do this on a RAID 0!! */
1527 return(EINVAL);
1528 }
1529
1530 rr = (struct rf_recon_req *) data;
1531 rr->row = 0;
1532 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1533 return (EINVAL);
1534
1535
1536 RF_LOCK_MUTEX(raidPtr->mutex);
1537 if (raidPtr->status == rf_rs_reconstructing) {
1538 /* you can't fail a disk while we're reconstructing! */
1539 /* XXX wrong for RAID6 */
1540 RF_UNLOCK_MUTEX(raidPtr->mutex);
1541 return (EINVAL);
1542 }
1543 if ((raidPtr->Disks[rr->col].status ==
1544 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1545 /* some other component has failed. Let's not make
1546 things worse. XXX wrong for RAID6 */
1547 RF_UNLOCK_MUTEX(raidPtr->mutex);
1548 return (EINVAL);
1549 }
1550 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1551 /* Can't fail a spared disk! */
1552 RF_UNLOCK_MUTEX(raidPtr->mutex);
1553 return (EINVAL);
1554 }
1555 RF_UNLOCK_MUTEX(raidPtr->mutex);
1556
1557 /* make a copy of the recon request so that we don't rely on
1558 * the user's buffer */
1559 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1560 if (rrcopy == NULL)
1561 return(ENOMEM);
1562 memcpy(rrcopy, rr, sizeof(*rr));
1563 rrcopy->raidPtr = (void *) raidPtr;
1564
1565 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1566 rf_ReconThread,
1567 rrcopy,"raid_recon");
1568 return (0);
1569
1570 /* invoke a copyback operation after recon on whatever disk
1571 * needs it, if any */
1572 case RAIDFRAME_COPYBACK:
1573
1574 if (raidPtr->Layout.map->faultsTolerated == 0) {
1575 /* This makes no sense on a RAID 0!! */
1576 return(EINVAL);
1577 }
1578
1579 if (raidPtr->copyback_in_progress == 1) {
1580 /* Copyback is already in progress! */
1581 return(EINVAL);
1582 }
1583
1584 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1585 rf_CopybackThread,
1586 raidPtr,"raid_copyback");
1587 return (retcode);
1588
1589 /* return the percentage completion of reconstruction */
1590 case RAIDFRAME_CHECK_RECON_STATUS:
1591 if (raidPtr->Layout.map->faultsTolerated == 0) {
1592 /* This makes no sense on a RAID 0, so tell the
1593 user it's done. */
1594 *(int *) data = 100;
1595 return(0);
1596 }
1597 if (raidPtr->status != rf_rs_reconstructing)
1598 *(int *) data = 100;
1599 else {
1600 if (raidPtr->reconControl->numRUsTotal > 0) {
1601 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1602 } else {
1603 *(int *) data = 0;
1604 }
1605 }
1606 return (0);
1607 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1608 progressInfoPtr = (RF_ProgressInfo_t **) data;
1609 if (raidPtr->status != rf_rs_reconstructing) {
1610 progressInfo.remaining = 0;
1611 progressInfo.completed = 100;
1612 progressInfo.total = 100;
1613 } else {
1614 progressInfo.total =
1615 raidPtr->reconControl->numRUsTotal;
1616 progressInfo.completed =
1617 raidPtr->reconControl->numRUsComplete;
1618 progressInfo.remaining = progressInfo.total -
1619 progressInfo.completed;
1620 }
1621 retcode = copyout(&progressInfo, *progressInfoPtr,
1622 sizeof(RF_ProgressInfo_t));
1623 return (retcode);
1624
1625 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1626 if (raidPtr->Layout.map->faultsTolerated == 0) {
1627 /* This makes no sense on a RAID 0, so tell the
1628 user it's done. */
1629 *(int *) data = 100;
1630 return(0);
1631 }
1632 if (raidPtr->parity_rewrite_in_progress == 1) {
1633 *(int *) data = 100 *
1634 raidPtr->parity_rewrite_stripes_done /
1635 raidPtr->Layout.numStripe;
1636 } else {
1637 *(int *) data = 100;
1638 }
1639 return (0);
1640
1641 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1642 progressInfoPtr = (RF_ProgressInfo_t **) data;
1643 if (raidPtr->parity_rewrite_in_progress == 1) {
1644 progressInfo.total = raidPtr->Layout.numStripe;
1645 progressInfo.completed =
1646 raidPtr->parity_rewrite_stripes_done;
1647 progressInfo.remaining = progressInfo.total -
1648 progressInfo.completed;
1649 } else {
1650 progressInfo.remaining = 0;
1651 progressInfo.completed = 100;
1652 progressInfo.total = 100;
1653 }
1654 retcode = copyout(&progressInfo, *progressInfoPtr,
1655 sizeof(RF_ProgressInfo_t));
1656 return (retcode);
1657
1658 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1659 if (raidPtr->Layout.map->faultsTolerated == 0) {
1660 /* This makes no sense on a RAID 0 */
1661 *(int *) data = 100;
1662 return(0);
1663 }
1664 if (raidPtr->copyback_in_progress == 1) {
1665 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1666 raidPtr->Layout.numStripe;
1667 } else {
1668 *(int *) data = 100;
1669 }
1670 return (0);
1671
1672 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1673 progressInfoPtr = (RF_ProgressInfo_t **) data;
1674 if (raidPtr->copyback_in_progress == 1) {
1675 progressInfo.total = raidPtr->Layout.numStripe;
1676 progressInfo.completed =
1677 raidPtr->copyback_stripes_done;
1678 progressInfo.remaining = progressInfo.total -
1679 progressInfo.completed;
1680 } else {
1681 progressInfo.remaining = 0;
1682 progressInfo.completed = 100;
1683 progressInfo.total = 100;
1684 }
1685 retcode = copyout(&progressInfo, *progressInfoPtr,
1686 sizeof(RF_ProgressInfo_t));
1687 return (retcode);
1688
1689 /* the sparetable daemon calls this to wait for the kernel to
1690 * need a spare table. this ioctl does not return until a
1691 * spare table is needed. XXX -- calling mpsleep here in the
1692 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1693 * -- I should either compute the spare table in the kernel,
1694 * or have a different -- XXX XXX -- interface (a different
1695 * character device) for delivering the table -- XXX */
1696 #if 0
1697 case RAIDFRAME_SPARET_WAIT:
1698 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1699 while (!rf_sparet_wait_queue)
1700 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1701 waitreq = rf_sparet_wait_queue;
1702 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1703 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1704
1705 /* structure assignment */
1706 *((RF_SparetWait_t *) data) = *waitreq;
1707
1708 RF_Free(waitreq, sizeof(*waitreq));
1709 return (0);
1710
1711 /* wakes up a process waiting on SPARET_WAIT and puts an error
1712 * code in it that will cause the daemon to exit */
1713 case RAIDFRAME_ABORT_SPARET_WAIT:
1714 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1715 waitreq->fcol = -1;
1716 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1717 waitreq->next = rf_sparet_wait_queue;
1718 rf_sparet_wait_queue = waitreq;
1719 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1720 wakeup(&rf_sparet_wait_queue);
1721 return (0);
1722
1723 /* used by the spare table daemon to deliver a spare table
1724 * into the kernel */
1725 case RAIDFRAME_SEND_SPARET:
1726
1727 /* install the spare table */
1728 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1729
1730 /* respond to the requestor. the return status of the spare
1731 * table installation is passed in the "fcol" field */
1732 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1733 waitreq->fcol = retcode;
1734 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1735 waitreq->next = rf_sparet_resp_queue;
1736 rf_sparet_resp_queue = waitreq;
1737 wakeup(&rf_sparet_resp_queue);
1738 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1739
1740 return (retcode);
1741 #endif
1742
1743 default:
1744 break; /* fall through to the os-specific code below */
1745
1746 }
1747
1748 if (!raidPtr->valid)
1749 return (EINVAL);
1750
1751 /*
1752 * Add support for "regular" device ioctls here.
1753 */
1754
1755 switch (cmd) {
1756 case DIOCGDINFO:
1757 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1758 break;
1759 #ifdef __HAVE_OLD_DISKLABEL
1760 case ODIOCGDINFO:
1761 newlabel = *(rs->sc_dkdev.dk_label);
1762 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1763 return ENOTTY;
1764 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1765 break;
1766 #endif
1767
1768 case DIOCGPART:
1769 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1770 ((struct partinfo *) data)->part =
1771 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1772 break;
1773
1774 case DIOCWDINFO:
1775 case DIOCSDINFO:
1776 #ifdef __HAVE_OLD_DISKLABEL
1777 case ODIOCWDINFO:
1778 case ODIOCSDINFO:
1779 #endif
1780 {
1781 struct disklabel *lp;
1782 #ifdef __HAVE_OLD_DISKLABEL
1783 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1784 memset(&newlabel, 0, sizeof newlabel);
1785 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1786 lp = &newlabel;
1787 } else
1788 #endif
1789 lp = (struct disklabel *)data;
1790
1791 if ((error = raidlock(rs)) != 0)
1792 return (error);
1793
1794 rs->sc_flags |= RAIDF_LABELLING;
1795
1796 error = setdisklabel(rs->sc_dkdev.dk_label,
1797 lp, 0, rs->sc_dkdev.dk_cpulabel);
1798 if (error == 0) {
1799 if (cmd == DIOCWDINFO
1800 #ifdef __HAVE_OLD_DISKLABEL
1801 || cmd == ODIOCWDINFO
1802 #endif
1803 )
1804 error = writedisklabel(RAIDLABELDEV(dev),
1805 raidstrategy, rs->sc_dkdev.dk_label,
1806 rs->sc_dkdev.dk_cpulabel);
1807 }
1808 rs->sc_flags &= ~RAIDF_LABELLING;
1809
1810 raidunlock(rs);
1811
1812 if (error)
1813 return (error);
1814 break;
1815 }
1816
1817 case DIOCWLABEL:
1818 if (*(int *) data != 0)
1819 rs->sc_flags |= RAIDF_WLABEL;
1820 else
1821 rs->sc_flags &= ~RAIDF_WLABEL;
1822 break;
1823
1824 case DIOCGDEFLABEL:
1825 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1826 break;
1827
1828 #ifdef __HAVE_OLD_DISKLABEL
1829 case ODIOCGDEFLABEL:
1830 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1831 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1832 return ENOTTY;
1833 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1834 break;
1835 #endif
1836
1837 case DIOCAWEDGE:
1838 case DIOCDWEDGE:
1839 dkw = (void *)data;
1840
1841 /* If the ioctl happens here, the parent is us. */
1842 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1843 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1844
1845 case DIOCLWEDGES:
1846 return dkwedge_list(&rs->sc_dkdev,
1847 (struct dkwedge_list *)data, l);
1848 case DIOCCACHESYNC:
1849 return rf_sync_component_caches(raidPtr);
1850 default:
1851 retcode = ENOTTY;
1852 }
1853 return (retcode);
1854
1855 }
1856
1857
1858 /* raidinit -- complete the rest of the initialization for the
1859 RAIDframe device. */
1860
1861
1862 static void
1863 raidinit(RF_Raid_t *raidPtr)
1864 {
1865 struct cfdata *cf;
1866 struct raid_softc *rs;
1867 int unit;
1868
1869 unit = raidPtr->raidid;
1870
1871 rs = &raid_softc[unit];
1872
1873 /* XXX should check return code first... */
1874 rs->sc_flags |= RAIDF_INITED;
1875
1876 /* XXX doesn't check bounds. */
1877 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1878
1879 /* attach the pseudo device */
1880 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1881 cf->cf_name = raid_cd.cd_name;
1882 cf->cf_atname = raid_cd.cd_name;
1883 cf->cf_unit = unit;
1884 cf->cf_fstate = FSTATE_STAR;
1885
1886 rs->sc_dev = config_attach_pseudo(cf);
1887
1888 if (rs->sc_dev==NULL) {
1889 printf("raid%d: config_attach_pseudo failed\n",
1890 raidPtr->raidid);
1891 }
1892
1893 /* disk_attach actually creates space for the CPU disklabel, among
1894 * other things, so it's critical to call this *BEFORE* we try putzing
1895 * with disklabels. */
1896
1897 disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1898 disk_attach(&rs->sc_dkdev);
1899
1900 /* XXX There may be a weird interaction here between this, and
1901 * protectedSectors, as used in RAIDframe. */
1902
1903 rs->sc_size = raidPtr->totalSectors;
1904
1905 dkwedge_discover(&rs->sc_dkdev);
1906
1907 rf_set_properties(rs, raidPtr);
1908
1909 }
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/*
 * rf_GetSpareTableFromDaemon -- wake up the daemon & tell it to get us
 * a spare table.
 *
 * The request is pushed onto the head of rf_sparet_wait_queue and
 * sleepers on that queue are woken.  The caller then sleeps until
 * rf_sparet_resp_queue is non-empty, pops its head entry, frees that
 * entry and returns the entry's fcol field.
 *
 * XXX
 * The entries in the queues should be tagged with the raidPtr so that,
 * in the extremely rare case that two recons happen at once, we know
 * for which device we're requesting a spare table.
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	RF_SparetWait_t *resp;
	int status;

	RF_LOCK_MUTEX(rf_sparet_wait_mutex);

	/* queue our request at the head and kick whoever is waiting */
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	wakeup(&rf_sparet_wait_queue);

	/*
	 * NOTE(review): the previous comment here said "mpsleep unlocks
	 * the mutex", but the sleep is a plain tsleep() issued while
	 * rf_sparet_wait_mutex is still taken -- confirm before this code
	 * is ever enabled.
	 */
	while (rf_sparet_resp_queue == NULL)
		tsleep(&rf_sparet_resp_queue, PRIBIO,
		    "raidframe getsparetable", 0);

	/* take the first response off the queue */
	resp = rf_sparet_resp_queue;
	rf_sparet_resp_queue = resp->next;
	RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);

	status = resp->fcol;
	/* this is not the same request structure as the one passed in */
	RF_Free(resp, sizeof(*resp));
	return (status);
}
#endif
1945
1946 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1947 * bp & passes it down.
1948 * any calls originating in the kernel must use non-blocking I/O
1949 * do some extra sanity checking to return "appropriate" error values for
1950 * certain conditions (to make some standard utilities work)
1951 *
1952 * Formerly known as: rf_DoAccessKernel
1953 */
1954 void
1955 raidstart(RF_Raid_t *raidPtr)
1956 {
1957 RF_SectorCount_t num_blocks, pb, sum;
1958 RF_RaidAddr_t raid_addr;
1959 struct partition *pp;
1960 daddr_t blocknum;
1961 int unit;
1962 struct raid_softc *rs;
1963 int do_async;
1964 struct buf *bp;
1965 int rc;
1966
1967 unit = raidPtr->raidid;
1968 rs = &raid_softc[unit];
1969
1970 /* quick check to see if anything has died recently */
1971 RF_LOCK_MUTEX(raidPtr->mutex);
1972 if (raidPtr->numNewFailures > 0) {
1973 RF_UNLOCK_MUTEX(raidPtr->mutex);
1974 rf_update_component_labels(raidPtr,
1975 RF_NORMAL_COMPONENT_UPDATE);
1976 RF_LOCK_MUTEX(raidPtr->mutex);
1977 raidPtr->numNewFailures--;
1978 }
1979
1980 /* Check to see if we're at the limit... */
1981 while (raidPtr->openings > 0) {
1982 RF_UNLOCK_MUTEX(raidPtr->mutex);
1983
1984 /* get the next item, if any, from the queue */
1985 if ((bp = BUFQ_GET(rs->buf_queue)) == NULL) {
1986 /* nothing more to do */
1987 return;
1988 }
1989
1990 /* Ok, for the bp we have here, bp->b_blkno is relative to the
1991 * partition.. Need to make it absolute to the underlying
1992 * device.. */
1993
1994 blocknum = bp->b_blkno;
1995 if (DISKPART(bp->b_dev) != RAW_PART) {
1996 pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
1997 blocknum += pp->p_offset;
1998 }
1999
2000 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
2001 (int) blocknum));
2002
2003 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
2004 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
2005
2006 /* *THIS* is where we adjust what block we're going to...
2007 * but DO NOT TOUCH bp->b_blkno!!! */
2008 raid_addr = blocknum;
2009
2010 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
2011 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
2012 sum = raid_addr + num_blocks + pb;
2013 if (1 || rf_debugKernelAccess) {
2014 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
2015 (int) raid_addr, (int) sum, (int) num_blocks,
2016 (int) pb, (int) bp->b_resid));
2017 }
2018 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
2019 || (sum < num_blocks) || (sum < pb)) {
2020 bp->b_error = ENOSPC;
2021 bp->b_resid = bp->b_bcount;
2022 biodone(bp);
2023 RF_LOCK_MUTEX(raidPtr->mutex);
2024 continue;
2025 }
2026 /*
2027 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
2028 */
2029
2030 if (bp->b_bcount & raidPtr->sectorMask) {
2031 bp->b_error = EINVAL;
2032 bp->b_resid = bp->b_bcount;
2033 biodone(bp);
2034 RF_LOCK_MUTEX(raidPtr->mutex);
2035 continue;
2036
2037 }
2038 db1_printf(("Calling DoAccess..\n"));
2039
2040
2041 RF_LOCK_MUTEX(raidPtr->mutex);
2042 raidPtr->openings--;
2043 RF_UNLOCK_MUTEX(raidPtr->mutex);
2044
2045 /*
2046 * Everything is async.
2047 */
2048 do_async = 1;
2049
2050 disk_busy(&rs->sc_dkdev);
2051
2052 /* XXX we're still at splbio() here... do we *really*
2053 need to be? */
2054
2055 /* don't ever condition on bp->b_flags & B_WRITE.
2056 * always condition on B_READ instead */
2057
2058 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
2059 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
2060 do_async, raid_addr, num_blocks,
2061 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2062
2063 if (rc) {
2064 bp->b_error = rc;
2065 bp->b_resid = bp->b_bcount;
2066 biodone(bp);
2067 /* continue loop */
2068 }
2069
2070 RF_LOCK_MUTEX(raidPtr->mutex);
2071 }
2072 RF_UNLOCK_MUTEX(raidPtr->mutex);
2073 }
2074
2075
2076
2077
2078 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2079
2080 int
2081 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2082 {
2083 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2084 struct buf *bp;
2085
2086 req->queue = queue;
2087
2088 #if DIAGNOSTIC
2089 if (queue->raidPtr->raidid >= numraid) {
2090 printf("Invalid unit number: %d %d\n", queue->raidPtr->raidid,
2091 numraid);
2092 panic("Invalid Unit number in rf_DispatchKernelIO");
2093 }
2094 #endif
2095
2096 bp = req->bp;
2097
2098 switch (req->type) {
2099 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
2100 /* XXX need to do something extra here.. */
2101 /* I'm leaving this in, as I've never actually seen it used,
2102 * and I'd like folks to report it... GO */
2103 printf(("WAKEUP CALLED\n"));
2104 queue->numOutstanding++;
2105
2106 bp->b_flags = 0;
2107 bp->b_private = req;
2108
2109 KernelWakeupFunc(bp);
2110 break;
2111
2112 case RF_IO_TYPE_READ:
2113 case RF_IO_TYPE_WRITE:
2114 #if RF_ACC_TRACE > 0
2115 if (req->tracerec) {
2116 RF_ETIMER_START(req->tracerec->timer);
2117 }
2118 #endif
2119 InitBP(bp, queue->rf_cinfo->ci_vp,
2120 op, queue->rf_cinfo->ci_dev,
2121 req->sectorOffset, req->numSector,
2122 req->buf, KernelWakeupFunc, (void *) req,
2123 queue->raidPtr->logBytesPerSector, req->b_proc);
2124
2125 if (rf_debugKernelAccess) {
2126 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2127 (long) bp->b_blkno));
2128 }
2129 queue->numOutstanding++;
2130 queue->last_deq_sector = req->sectorOffset;
2131 /* acc wouldn't have been let in if there were any pending
2132 * reqs at any other priority */
2133 queue->curPriority = req->priority;
2134
2135 db1_printf(("Going for %c to unit %d col %d\n",
2136 req->type, queue->raidPtr->raidid,
2137 queue->col));
2138 db1_printf(("sector %d count %d (%d bytes) %d\n",
2139 (int) req->sectorOffset, (int) req->numSector,
2140 (int) (req->numSector <<
2141 queue->raidPtr->logBytesPerSector),
2142 (int) queue->raidPtr->logBytesPerSector));
2143
2144 /*
2145 * XXX: drop lock here since this can block at
2146 * least with backing SCSI devices. Retake it
2147 * to minimize fuss with calling interfaces.
2148 */
2149
2150 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2151 bdev_strategy(bp);
2152 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2153 break;
2154
2155 default:
2156 panic("bad req->type in rf_DispatchKernelIO");
2157 }
2158 db1_printf(("Exiting from DispatchKernelIO\n"));
2159
2160 return (0);
2161 }
2162 /* this is the callback function associated with a I/O invoked from
2163 kernel code.
2164 */
2165 static void
2166 KernelWakeupFunc(struct buf *bp)
2167 {
2168 RF_DiskQueueData_t *req = NULL;
2169 RF_DiskQueue_t *queue;
2170 int s;
2171
2172 s = splbio();
2173 db1_printf(("recovering the request queue:\n"));
2174 req = bp->b_private;
2175
2176 queue = (RF_DiskQueue_t *) req->queue;
2177
2178 #if RF_ACC_TRACE > 0
2179 if (req->tracerec) {
2180 RF_ETIMER_STOP(req->tracerec->timer);
2181 RF_ETIMER_EVAL(req->tracerec->timer);
2182 RF_LOCK_MUTEX(rf_tracing_mutex);
2183 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2184 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2185 req->tracerec->num_phys_ios++;
2186 RF_UNLOCK_MUTEX(rf_tracing_mutex);
2187 }
2188 #endif
2189
2190 /* XXX Ok, let's get aggressive... If b_error is set, let's go
2191 * ballistic, and mark the component as hosed... */
2192
2193 if (bp->b_error != 0) {
2194 /* Mark the disk as dead */
2195 /* but only mark it once... */
2196 /* and only if it wouldn't leave this RAID set
2197 completely broken */
2198 if (((queue->raidPtr->Disks[queue->col].status ==
2199 rf_ds_optimal) ||
2200 (queue->raidPtr->Disks[queue->col].status ==
2201 rf_ds_used_spare)) &&
2202 (queue->raidPtr->numFailures <
2203 queue->raidPtr->Layout.map->faultsTolerated)) {
2204 printf("raid%d: IO Error. Marking %s as failed.\n",
2205 queue->raidPtr->raidid,
2206 queue->raidPtr->Disks[queue->col].devname);
2207 queue->raidPtr->Disks[queue->col].status =
2208 rf_ds_failed;
2209 queue->raidPtr->status = rf_rs_degraded;
2210 queue->raidPtr->numFailures++;
2211 queue->raidPtr->numNewFailures++;
2212 } else { /* Disk is already dead... */
2213 /* printf("Disk already marked as dead!\n"); */
2214 }
2215
2216 }
2217
2218 /* Fill in the error value */
2219
2220 req->error = bp->b_error;
2221
2222 simple_lock(&queue->raidPtr->iodone_lock);
2223
2224 /* Drop this one on the "finished" queue... */
2225 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2226
2227 /* Let the raidio thread know there is work to be done. */
2228 wakeup(&(queue->raidPtr->iodone));
2229
2230 simple_unlock(&queue->raidPtr->iodone_lock);
2231
2232 splx(s);
2233 }
2234
2235
2236
2237 /*
2238 * initialize a buf structure for doing an I/O in the kernel.
2239 */
2240 static void
2241 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2242 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2243 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2244 struct proc *b_proc)
2245 {
2246 /* bp->b_flags = B_PHYS | rw_flag; */
2247 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2248 bp->b_oflags = 0;
2249 bp->b_cflags = 0;
2250 bp->b_bcount = numSect << logBytesPerSector;
2251 bp->b_bufsize = bp->b_bcount;
2252 bp->b_error = 0;
2253 bp->b_dev = dev;
2254 bp->b_data = bf;
2255 bp->b_blkno = startSect;
2256 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2257 if (bp->b_bcount == 0) {
2258 panic("bp->b_bcount is zero in InitBP!!");
2259 }
2260 bp->b_proc = b_proc;
2261 bp->b_iodone = cbFunc;
2262 bp->b_private = cbArg;
2263 }
2264
2265 static void
2266 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2267 struct disklabel *lp)
2268 {
2269 memset(lp, 0, sizeof(*lp));
2270
2271 /* fabricate a label... */
2272 lp->d_secperunit = raidPtr->totalSectors;
2273 lp->d_secsize = raidPtr->bytesPerSector;
2274 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2275 lp->d_ntracks = 4 * raidPtr->numCol;
2276 lp->d_ncylinders = raidPtr->totalSectors /
2277 (lp->d_nsectors * lp->d_ntracks);
2278 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2279
2280 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2281 lp->d_type = DTYPE_RAID;
2282 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2283 lp->d_rpm = 3600;
2284 lp->d_interleave = 1;
2285 lp->d_flags = 0;
2286
2287 lp->d_partitions[RAW_PART].p_offset = 0;
2288 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2289 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2290 lp->d_npartitions = RAW_PART + 1;
2291
2292 lp->d_magic = DISKMAGIC;
2293 lp->d_magic2 = DISKMAGIC;
2294 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2295
2296 }
2297 /*
2298 * Read the disklabel from the raid device. If one is not present, fake one
2299 * up.
2300 */
2301 static void
2302 raidgetdisklabel(dev_t dev)
2303 {
2304 int unit = raidunit(dev);
2305 struct raid_softc *rs = &raid_softc[unit];
2306 const char *errstring;
2307 struct disklabel *lp = rs->sc_dkdev.dk_label;
2308 struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
2309 RF_Raid_t *raidPtr;
2310
2311 db1_printf(("Getting the disklabel...\n"));
2312
2313 memset(clp, 0, sizeof(*clp));
2314
2315 raidPtr = raidPtrs[unit];
2316
2317 raidgetdefaultlabel(raidPtr, rs, lp);
2318
2319 /*
2320 * Call the generic disklabel extraction routine.
2321 */
2322 errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
2323 rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
2324 if (errstring)
2325 raidmakedisklabel(rs);
2326 else {
2327 int i;
2328 struct partition *pp;
2329
2330 /*
2331 * Sanity check whether the found disklabel is valid.
2332 *
2333 * This is necessary since total size of the raid device
2334 * may vary when an interleave is changed even though exactly
2335 * same components are used, and old disklabel may used
2336 * if that is found.
2337 */
2338 if (lp->d_secperunit != rs->sc_size)
2339 printf("raid%d: WARNING: %s: "
2340 "total sector size in disklabel (%" PRIu32 ") != "
2341 "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
2342 lp->d_secperunit, rs->sc_size);
2343 for (i = 0; i < lp->d_npartitions; i++) {
2344 pp = &lp->d_partitions[i];
2345 if (pp->p_offset + pp->p_size > rs->sc_size)
2346 printf("raid%d: WARNING: %s: end of partition `%c' "
2347 "exceeds the size of raid (%" PRIu64 ")\n",
2348 unit, rs->sc_xname, 'a' + i, rs->sc_size);
2349 }
2350 }
2351
2352 }
2353 /*
2354 * Take care of things one might want to take care of in the event
2355 * that a disklabel isn't present.
2356 */
2357 static void
2358 raidmakedisklabel(struct raid_softc *rs)
2359 {
2360 struct disklabel *lp = rs->sc_dkdev.dk_label;
2361 db1_printf(("Making a label..\n"));
2362
2363 /*
2364 * For historical reasons, if there's no disklabel present
2365 * the raw partition must be marked FS_BSDFFS.
2366 */
2367
2368 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2369
2370 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2371
2372 lp->d_checksum = dkcksum(lp);
2373 }
2374 /*
2375 * Wait interruptibly for an exclusive lock.
2376 *
2377 * XXX
2378 * Several drivers do this; it should be abstracted and made MP-safe.
2379 * (Hmm... where have we seen this warning before :-> GO )
2380 */
2381 static int
2382 raidlock(struct raid_softc *rs)
2383 {
2384 int error;
2385
2386 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2387 rs->sc_flags |= RAIDF_WANTED;
2388 if ((error =
2389 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2390 return (error);
2391 }
2392 rs->sc_flags |= RAIDF_LOCKED;
2393 return (0);
2394 }
2395 /*
2396 * Unlock and wake up any waiters.
2397 */
2398 static void
2399 raidunlock(struct raid_softc *rs)
2400 {
2401
2402 rs->sc_flags &= ~RAIDF_LOCKED;
2403 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2404 rs->sc_flags &= ~RAIDF_WANTED;
2405 wakeup(rs);
2406 }
2407 }
2408
2409
/*
 * Location of the per-component metadata.  The offsets are byte offsets on
 * the component device (the I/O helpers below divide them by DEV_BSIZE to
 * obtain a block number).  The parity map area starts immediately after the
 * component info area.
 */
#define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
#define RF_COMPONENT_INFO_SIZE    1024 /* bytes */
#define RF_PARITY_MAP_OFFSET \
	(RF_COMPONENT_INFO_OFFSET + RF_COMPONENT_INFO_SIZE)
#define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2415
2416 int
2417 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2418 {
2419 RF_ComponentLabel_t *clabel;
2420
2421 clabel = raidget_component_label(raidPtr, col);
2422 clabel->clean = RF_RAID_CLEAN;
2423 raidflush_component_label(raidPtr, col);
2424 return(0);
2425 }
2426
2427
2428 int
2429 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2430 {
2431 RF_ComponentLabel_t *clabel;
2432
2433 clabel = raidget_component_label(raidPtr, col);
2434 clabel->clean = RF_RAID_DIRTY;
2435 raidflush_component_label(raidPtr, col);
2436 return(0);
2437 }
2438
2439 int
2440 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2441 {
2442 return raidread_component_label(raidPtr->Disks[col].dev,
2443 raidPtr->raid_cinfo[col].ci_vp,
2444 &raidPtr->raid_cinfo[col].ci_label);
2445 }
2446
2447 RF_ComponentLabel_t *
2448 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2449 {
2450 return &raidPtr->raid_cinfo[col].ci_label;
2451 }
2452
2453 int
2454 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2455 {
2456 RF_ComponentLabel_t *label;
2457
2458 label = &raidPtr->raid_cinfo[col].ci_label;
2459 label->mod_counter = raidPtr->mod_counter;
2460 #ifndef RF_NO_PARITY_MAP
2461 label->parity_map_modcount = label->mod_counter;
2462 #endif
2463 return raidwrite_component_label(raidPtr->Disks[col].dev,
2464 raidPtr->raid_cinfo[col].ci_vp, label);
2465 }
2466
2467
2468 static int
2469 raidread_component_label(dev_t dev, struct vnode *b_vp,
2470 RF_ComponentLabel_t *clabel)
2471 {
2472 return raidread_component_area(dev, b_vp, clabel,
2473 sizeof(RF_ComponentLabel_t),
2474 RF_COMPONENT_INFO_OFFSET, RF_COMPONENT_INFO_SIZE);
2475 }
2476
2477 /* ARGSUSED */
2478 static int
2479 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2480 size_t msize, daddr_t offset, daddr_t dsize)
2481 {
2482 struct buf *bp;
2483 const struct bdevsw *bdev;
2484 int error;
2485
2486 /* XXX should probably ensure that we don't try to do this if
2487 someone has changed rf_protected_sectors. */
2488
2489 if (b_vp == NULL) {
2490 /* For whatever reason, this component is not valid.
2491 Don't try to read a component label from it. */
2492 return(EINVAL);
2493 }
2494
2495 /* get a block of the appropriate size... */
2496 bp = geteblk((int)dsize);
2497 bp->b_dev = dev;
2498
2499 /* get our ducks in a row for the read */
2500 bp->b_blkno = offset / DEV_BSIZE;
2501 bp->b_bcount = dsize;
2502 bp->b_flags |= B_READ;
2503 bp->b_resid = dsize;
2504
2505 bdev = bdevsw_lookup(bp->b_dev);
2506 if (bdev == NULL)
2507 return (ENXIO);
2508 (*bdev->d_strategy)(bp);
2509
2510 error = biowait(bp);
2511
2512 if (!error) {
2513 memcpy(data, bp->b_data, msize);
2514 }
2515
2516 brelse(bp, 0);
2517 return(error);
2518 }
2519
2520
2521 static int
2522 raidwrite_component_label(dev_t dev, struct vnode *b_vp,
2523 RF_ComponentLabel_t *clabel)
2524 {
2525 return raidwrite_component_area(dev, b_vp, clabel,
2526 sizeof(RF_ComponentLabel_t),
2527 RF_COMPONENT_INFO_OFFSET, RF_COMPONENT_INFO_SIZE, 0);
2528 }
2529
2530 /* ARGSUSED */
2531 static int
2532 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2533 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2534 {
2535 struct buf *bp;
2536 const struct bdevsw *bdev;
2537 int error;
2538
2539 /* get a block of the appropriate size... */
2540 bp = geteblk((int)dsize);
2541 bp->b_dev = dev;
2542
2543 /* get our ducks in a row for the write */
2544 bp->b_blkno = offset / DEV_BSIZE;
2545 bp->b_bcount = dsize;
2546 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2547 bp->b_resid = dsize;
2548
2549 memset(bp->b_data, 0, dsize);
2550 memcpy(bp->b_data, data, msize);
2551
2552 bdev = bdevsw_lookup(bp->b_dev);
2553 if (bdev == NULL)
2554 return (ENXIO);
2555 (*bdev->d_strategy)(bp);
2556 if (asyncp)
2557 return 0;
2558 error = biowait(bp);
2559 brelse(bp, 0);
2560 if (error) {
2561 #if 1
2562 printf("Failed to write RAID component info!\n");
2563 #endif
2564 }
2565
2566 return(error);
2567 }
2568
2569 void
2570 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2571 {
2572 int c;
2573
2574 for (c = 0; c < raidPtr->numCol; c++) {
2575 /* Skip dead disks. */
2576 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2577 continue;
2578 /* XXXjld: what if an error occurs here? */
2579 raidwrite_component_area(raidPtr->Disks[c].dev,
2580 raidPtr->raid_cinfo[c].ci_vp, map,
2581 RF_PARITYMAP_NBYTE,
2582 RF_PARITY_MAP_OFFSET, RF_PARITY_MAP_SIZE, 0);
2583 }
2584 }
2585
2586 void
2587 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2588 {
2589 struct rf_paritymap_ondisk tmp;
2590 int c,first;
2591
2592 first=1;
2593 for (c = 0; c < raidPtr->numCol; c++) {
2594 /* Skip dead disks. */
2595 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2596 continue;
2597 raidread_component_area(raidPtr->Disks[c].dev,
2598 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2599 RF_PARITYMAP_NBYTE,
2600 RF_PARITY_MAP_OFFSET, RF_PARITY_MAP_SIZE);
2601 if (first) {
2602 memcpy(map, &tmp, sizeof(*map));
2603 first = 0;
2604 } else {
2605 rf_paritymap_merge(map, &tmp);
2606 }
2607 }
2608 }
2609
2610 void
2611 rf_markalldirty(RF_Raid_t *raidPtr)
2612 {
2613 RF_ComponentLabel_t *clabel;
2614 int sparecol;
2615 int c;
2616 int j;
2617 int scol = -1;
2618
2619 raidPtr->mod_counter++;
2620 for (c = 0; c < raidPtr->numCol; c++) {
2621 /* we don't want to touch (at all) a disk that has
2622 failed */
2623 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2624 clabel = raidget_component_label(raidPtr, c);
2625 if (clabel->status == rf_ds_spared) {
2626 /* XXX do something special...
2627 but whatever you do, don't
2628 try to access it!! */
2629 } else {
2630 raidmarkdirty(raidPtr, c);
2631 }
2632 }
2633 }
2634
2635 for( c = 0; c < raidPtr->numSpare ; c++) {
2636 sparecol = raidPtr->numCol + c;
2637 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2638 /*
2639
2640 we claim this disk is "optimal" if it's
2641 rf_ds_used_spare, as that means it should be
2642 directly substitutable for the disk it replaced.
2643 We note that too...
2644
2645 */
2646
2647 for(j=0;j<raidPtr->numCol;j++) {
2648 if (raidPtr->Disks[j].spareCol == sparecol) {
2649 scol = j;
2650 break;
2651 }
2652 }
2653
2654 clabel = raidget_component_label(raidPtr, sparecol);
2655 /* make sure status is noted */
2656
2657 raid_init_component_label(raidPtr, clabel);
2658
2659 clabel->row = 0;
2660 clabel->column = scol;
2661 /* Note: we *don't* change status from rf_ds_used_spare
2662 to rf_ds_optimal */
2663 /* clabel.status = rf_ds_optimal; */
2664
2665 raidmarkdirty(raidPtr, sparecol);
2666 }
2667 }
2668 }
2669
2670
2671 void
2672 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2673 {
2674 RF_ComponentLabel_t *clabel;
2675 int sparecol;
2676 int c;
2677 int j;
2678 int scol;
2679
2680 scol = -1;
2681
2682 /* XXX should do extra checks to make sure things really are clean,
2683 rather than blindly setting the clean bit... */
2684
2685 raidPtr->mod_counter++;
2686
2687 for (c = 0; c < raidPtr->numCol; c++) {
2688 if (raidPtr->Disks[c].status == rf_ds_optimal) {
2689 clabel = raidget_component_label(raidPtr, c);
2690 /* make sure status is noted */
2691 clabel->status = rf_ds_optimal;
2692
2693 /* note what unit we are configured as */
2694 clabel->last_unit = raidPtr->raidid;
2695
2696 raidflush_component_label(raidPtr, c);
2697 if (final == RF_FINAL_COMPONENT_UPDATE) {
2698 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2699 raidmarkclean(raidPtr, c);
2700 }
2701 }
2702 }
2703 /* else we don't touch it.. */
2704 }
2705
2706 for( c = 0; c < raidPtr->numSpare ; c++) {
2707 sparecol = raidPtr->numCol + c;
2708 /* Need to ensure that the reconstruct actually completed! */
2709 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2710 /*
2711
2712 we claim this disk is "optimal" if it's
2713 rf_ds_used_spare, as that means it should be
2714 directly substitutable for the disk it replaced.
2715 We note that too...
2716
2717 */
2718
2719 for(j=0;j<raidPtr->numCol;j++) {
2720 if (raidPtr->Disks[j].spareCol == sparecol) {
2721 scol = j;
2722 break;
2723 }
2724 }
2725
2726 /* XXX shouldn't *really* need this... */
2727 clabel = raidget_component_label(raidPtr, sparecol);
2728 /* make sure status is noted */
2729
2730 raid_init_component_label(raidPtr, clabel);
2731
2732 clabel->column = scol;
2733 clabel->status = rf_ds_optimal;
2734 clabel->last_unit = raidPtr->raidid;
2735
2736 raidflush_component_label(raidPtr, sparecol);
2737 if (final == RF_FINAL_COMPONENT_UPDATE) {
2738 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2739 raidmarkclean(raidPtr, sparecol);
2740 }
2741 }
2742 }
2743 }
2744 }
2745
2746 void
2747 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2748 {
2749
2750 if (vp != NULL) {
2751 if (auto_configured == 1) {
2752 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2753 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2754 vput(vp);
2755
2756 } else {
2757 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2758 }
2759 }
2760 }
2761
2762
2763 void
2764 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2765 {
2766 int r,c;
2767 struct vnode *vp;
2768 int acd;
2769
2770
2771 /* We take this opportunity to close the vnodes like we should.. */
2772
2773 for (c = 0; c < raidPtr->numCol; c++) {
2774 vp = raidPtr->raid_cinfo[c].ci_vp;
2775 acd = raidPtr->Disks[c].auto_configured;
2776 rf_close_component(raidPtr, vp, acd);
2777 raidPtr->raid_cinfo[c].ci_vp = NULL;
2778 raidPtr->Disks[c].auto_configured = 0;
2779 }
2780
2781 for (r = 0; r < raidPtr->numSpare; r++) {
2782 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2783 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2784 rf_close_component(raidPtr, vp, acd);
2785 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2786 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2787 }
2788 }
2789
2790
2791 void
2792 rf_ReconThread(struct rf_recon_req *req)
2793 {
2794 int s;
2795 RF_Raid_t *raidPtr;
2796
2797 s = splbio();
2798 raidPtr = (RF_Raid_t *) req->raidPtr;
2799 raidPtr->recon_in_progress = 1;
2800
2801 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2802 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2803
2804 RF_Free(req, sizeof(*req));
2805
2806 raidPtr->recon_in_progress = 0;
2807 splx(s);
2808
2809 /* That's all... */
2810 kthread_exit(0); /* does not return */
2811 }
2812
2813 void
2814 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2815 {
2816 int retcode;
2817 int s;
2818
2819 raidPtr->parity_rewrite_stripes_done = 0;
2820 raidPtr->parity_rewrite_in_progress = 1;
2821 s = splbio();
2822 retcode = rf_RewriteParity(raidPtr);
2823 splx(s);
2824 if (retcode) {
2825 printf("raid%d: Error re-writing parity!\n",raidPtr->raidid);
2826 } else {
2827 /* set the clean bit! If we shutdown correctly,
2828 the clean bit on each component label will get
2829 set */
2830 raidPtr->parity_good = RF_RAID_CLEAN;
2831 }
2832 raidPtr->parity_rewrite_in_progress = 0;
2833
2834 /* Anyone waiting for us to stop? If so, inform them... */
2835 if (raidPtr->waitShutdown) {
2836 wakeup(&raidPtr->parity_rewrite_in_progress);
2837 }
2838
2839 /* That's all... */
2840 kthread_exit(0); /* does not return */
2841 }
2842
2843
2844 void
2845 rf_CopybackThread(RF_Raid_t *raidPtr)
2846 {
2847 int s;
2848
2849 raidPtr->copyback_in_progress = 1;
2850 s = splbio();
2851 rf_CopybackReconstructedData(raidPtr);
2852 splx(s);
2853 raidPtr->copyback_in_progress = 0;
2854
2855 /* That's all... */
2856 kthread_exit(0); /* does not return */
2857 }
2858
2859
2860 void
2861 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2862 {
2863 int s;
2864 RF_Raid_t *raidPtr;
2865
2866 s = splbio();
2867 raidPtr = req->raidPtr;
2868 raidPtr->recon_in_progress = 1;
2869 rf_ReconstructInPlace(raidPtr, req->col);
2870 RF_Free(req, sizeof(*req));
2871 raidPtr->recon_in_progress = 0;
2872 splx(s);
2873
2874 /* That's all... */
2875 kthread_exit(0); /* does not return */
2876 }
2877
2878 static RF_AutoConfig_t *
2879 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2880 const char *cname, RF_SectorCount_t size)
2881 {
2882 int good_one = 0;
2883 RF_ComponentLabel_t *clabel;
2884 RF_AutoConfig_t *ac;
2885
2886 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
2887 if (clabel == NULL) {
2888 oomem:
2889 while(ac_list) {
2890 ac = ac_list;
2891 if (ac->clabel)
2892 free(ac->clabel, M_RAIDFRAME);
2893 ac_list = ac_list->next;
2894 free(ac, M_RAIDFRAME);
2895 }
2896 printf("RAID auto config: out of memory!\n");
2897 return NULL; /* XXX probably should panic? */
2898 }
2899
2900 if (!raidread_component_label(dev, vp, clabel)) {
2901 /* Got the label. Does it look reasonable? */
2902 if (rf_reasonable_label(clabel) &&
2903 (clabel->partitionSize <= size)) {
2904 #ifdef DEBUG
2905 printf("Component on: %s: %llu\n",
2906 cname, (unsigned long long)size);
2907 rf_print_component_label(clabel);
2908 #endif
2909 /* if it's reasonable, add it, else ignore it. */
2910 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2911 M_NOWAIT);
2912 if (ac == NULL) {
2913 free(clabel, M_RAIDFRAME);
2914 goto oomem;
2915 }
2916 strlcpy(ac->devname, cname, sizeof(ac->devname));
2917 ac->dev = dev;
2918 ac->vp = vp;
2919 ac->clabel = clabel;
2920 ac->next = ac_list;
2921 ac_list = ac;
2922 good_one = 1;
2923 }
2924 }
2925 if (!good_one) {
2926 /* cleanup */
2927 free(clabel, M_RAIDFRAME);
2928 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2929 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2930 vput(vp);
2931 }
2932 return ac_list;
2933 }
2934
/*
 * Scan every device known to the system for RAIDframe components and
 * return them as an autoconfig list (NULL if none were found).
 *
 * Only devices of class DV_DISK are considered, minus a fixed set of
 * driver names (fd, cd, md, hdfd, fdisa).  Wedges ("dk") are accepted when
 * their partition type is DKW_PTYPE_RAIDFRAME; for other disks every
 * disklabel partition of type FS_RAID is tried.  Each candidate is handed
 * to rf_get_component(), which decides whether it joins the list.
 */
RF_AutoConfig_t *
rf_find_raid_components()
{
	struct vnode *vp;
	struct disklabel label;
	struct device *dv;
	dev_t dev;
	int bmajor, bminor, wedge;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;


	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = alldevs.tqh_first; dv != NULL;
	     dv = dv->dv_list.tqe_next) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		/* get a vnode for the raw partition of this disk */

		/* a wedge is addressed by its unit alone, without a partition */
		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		if (wedge) {
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* only wedges typed as RAIDframe are candidates */
			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* the still-open vnode is handed to rf_get_component() */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size);
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		/* no label, nothing to look through on this disk */
		if (error)
			continue;

		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			/* component name: device name plus partition letter */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size);
		}
	}
	return ac_list;
}
3077
3078
3079 static int
3080 rf_reasonable_label(RF_ComponentLabel_t *clabel)
3081 {
3082
3083 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3084 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3085 ((clabel->clean == RF_RAID_CLEAN) ||
3086 (clabel->clean == RF_RAID_DIRTY)) &&
3087 clabel->row >=0 &&
3088 clabel->column >= 0 &&
3089 clabel->num_rows > 0 &&
3090 clabel->num_columns > 0 &&
3091 clabel->row < clabel->num_rows &&
3092 clabel->column < clabel->num_columns &&
3093 clabel->blockSize > 0 &&
3094 clabel->numBlocks > 0) {
3095 /* label looks reasonable enough... */
3096 return(1);
3097 }
3098 return(0);
3099 }
3100
3101
#ifdef DEBUG
/*
 * Dump the interesting fields of a component label to the console.
 * Only built when DEBUG is defined.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	const char *is_clean = clabel->clean ? "Yes" : "No";
	const char *is_auto = clabel->autoconfigure ? "Yes" : "No";
	const char *is_root = clabel->root_partition ? "Yes" : "No";

	/* position and geometry */
	printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	    clabel->row, clabel->column,
	    clabel->num_rows, clabel->num_columns);

	/* identity */
	printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
	    clabel->version, clabel->serial_number,
	    clabel->mod_counter);
	printf(" Clean: %s Status: %d\n", is_clean, clabel->status );

	/* layout */
	printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	    clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf(" RAID Level: %c blocksize: %d numBlocks: %d\n",
	    (char) clabel->parityConfig, clabel->blockSize,
	    clabel->numBlocks);

	/* autoconfiguration */
	printf(" Autoconfig: %s\n", is_auto );
	printf(" Contains root partition: %s\n", is_root );
	printf(" Last configured as: raid%d\n", clabel->last_unit );
#if 0
	printf(" Config order: %d\n", clabel->config_order);
#endif

}
#endif
3129
3130 RF_ConfigSet_t *
3131 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3132 {
3133 RF_AutoConfig_t *ac;
3134 RF_ConfigSet_t *config_sets;
3135 RF_ConfigSet_t *cset;
3136 RF_AutoConfig_t *ac_next;
3137
3138
3139 config_sets = NULL;
3140
3141 /* Go through the AutoConfig list, and figure out which components
3142 belong to what sets. */
3143 ac = ac_list;
3144 while(ac!=NULL) {
3145 /* we're going to putz with ac->next, so save it here
3146 for use at the end of the loop */
3147 ac_next = ac->next;
3148
3149 if (config_sets == NULL) {
3150 /* will need at least this one... */
3151 config_sets = (RF_ConfigSet_t *)
3152 malloc(sizeof(RF_ConfigSet_t),
3153 M_RAIDFRAME, M_NOWAIT);
3154 if (config_sets == NULL) {
3155 panic("rf_create_auto_sets: No memory!");
3156 }
3157 /* this one is easy :) */
3158 config_sets->ac = ac;
3159 config_sets->next = NULL;
3160 config_sets->rootable = 0;
3161 ac->next = NULL;
3162 } else {
3163 /* which set does this component fit into? */
3164 cset = config_sets;
3165 while(cset!=NULL) {
3166 if (rf_does_it_fit(cset, ac)) {
3167 /* looks like it matches... */
3168 ac->next = cset->ac;
3169 cset->ac = ac;
3170 break;
3171 }
3172 cset = cset->next;
3173 }
3174 if (cset==NULL) {
3175 /* didn't find a match above... new set..*/
3176 cset = (RF_ConfigSet_t *)
3177 malloc(sizeof(RF_ConfigSet_t),
3178 M_RAIDFRAME, M_NOWAIT);
3179 if (cset == NULL) {
3180 panic("rf_create_auto_sets: No memory!");
3181 }
3182 cset->ac = ac;
3183 ac->next = NULL;
3184 cset->next = config_sets;
3185 cset->rootable = 0;
3186 config_sets = cset;
3187 }
3188 }
3189 ac = ac_next;
3190 }
3191
3192
3193 return(config_sets);
3194 }
3195
3196 static int
3197 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3198 {
3199 RF_ComponentLabel_t *clabel1, *clabel2;
3200
3201 /* If this one matches the *first* one in the set, that's good
3202 enough, since the other members of the set would have been
3203 through here too... */
3204 /* note that we are not checking partitionSize here..
3205
3206 Note that we are also not checking the mod_counters here.
3207 If everything else matches execpt the mod_counter, that's
3208 good enough for this test. We will deal with the mod_counters
3209 a little later in the autoconfiguration process.
3210
3211 (clabel1->mod_counter == clabel2->mod_counter) &&
3212
3213 The reason we don't check for this is that failed disks
3214 will have lower modification counts. If those disks are
3215 not added to the set they used to belong to, then they will
3216 form their own set, which may result in 2 different sets,
3217 for example, competing to be configured at raid0, and
3218 perhaps competing to be the root filesystem set. If the
3219 wrong ones get configured, or both attempt to become /,
3220 weird behaviour and or serious lossage will occur. Thus we
3221 need to bring them into the fold here, and kick them out at
3222 a later point.
3223
3224 */
3225
3226 clabel1 = cset->ac->clabel;
3227 clabel2 = ac->clabel;
3228 if ((clabel1->version == clabel2->version) &&
3229 (clabel1->serial_number == clabel2->serial_number) &&
3230 (clabel1->num_rows == clabel2->num_rows) &&
3231 (clabel1->num_columns == clabel2->num_columns) &&
3232 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3233 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3234 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3235 (clabel1->parityConfig == clabel2->parityConfig) &&
3236 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3237 (clabel1->blockSize == clabel2->blockSize) &&
3238 (clabel1->numBlocks == clabel2->numBlocks) &&
3239 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3240 (clabel1->root_partition == clabel2->root_partition) &&
3241 (clabel1->last_unit == clabel2->last_unit) &&
3242 (clabel1->config_order == clabel2->config_order)) {
3243 /* if it get's here, it almost *has* to be a match */
3244 } else {
3245 /* it's not consistent with somebody in the set..
3246 punt */
3247 return(0);
3248 }
3249 /* all was fine.. it must fit... */
3250 return(1);
3251 }
3252
3253 int
3254 rf_have_enough_components(RF_ConfigSet_t *cset)
3255 {
3256 RF_AutoConfig_t *ac;
3257 RF_AutoConfig_t *auto_config;
3258 RF_ComponentLabel_t *clabel;
3259 int c;
3260 int num_cols;
3261 int num_missing;
3262 int mod_counter;
3263 int mod_counter_found;
3264 int even_pair_failed;
3265 char parity_type;
3266
3267
3268 /* check to see that we have enough 'live' components
3269 of this set. If so, we can configure it if necessary */
3270
3271 num_cols = cset->ac->clabel->num_columns;
3272 parity_type = cset->ac->clabel->parityConfig;
3273
3274 /* XXX Check for duplicate components!?!?!? */
3275
3276 /* Determine what the mod_counter is supposed to be for this set. */
3277
3278 mod_counter_found = 0;
3279 mod_counter = 0;
3280 ac = cset->ac;
3281 while(ac!=NULL) {
3282 if (mod_counter_found==0) {
3283 mod_counter = ac->clabel->mod_counter;
3284 mod_counter_found = 1;
3285 } else {
3286 if (ac->clabel->mod_counter > mod_counter) {
3287 mod_counter = ac->clabel->mod_counter;
3288 }
3289 }
3290 ac = ac->next;
3291 }
3292
3293 num_missing = 0;
3294 auto_config = cset->ac;
3295
3296 even_pair_failed = 0;
3297 for(c=0; c<num_cols; c++) {
3298 ac = auto_config;
3299 while(ac!=NULL) {
3300 if ((ac->clabel->column == c) &&
3301 (ac->clabel->mod_counter == mod_counter)) {
3302 /* it's this one... */
3303 #ifdef DEBUG
3304 printf("Found: %s at %d\n",
3305 ac->devname,c);
3306 #endif
3307 break;
3308 }
3309 ac=ac->next;
3310 }
3311 if (ac==NULL) {
3312 /* Didn't find one here! */
3313 /* special case for RAID 1, especially
3314 where there are more than 2
3315 components (where RAIDframe treats
3316 things a little differently :( ) */
3317 if (parity_type == '1') {
3318 if (c%2 == 0) { /* even component */
3319 even_pair_failed = 1;
3320 } else { /* odd component. If
3321 we're failed, and
3322 so is the even
3323 component, it's
3324 "Good Night, Charlie" */
3325 if (even_pair_failed == 1) {
3326 return(0);
3327 }
3328 }
3329 } else {
3330 /* normal accounting */
3331 num_missing++;
3332 }
3333 }
3334 if ((parity_type == '1') && (c%2 == 1)) {
3335 /* Just did an even component, and we didn't
3336 bail.. reset the even_pair_failed flag,
3337 and go on to the next component.... */
3338 even_pair_failed = 0;
3339 }
3340 }
3341
3342 clabel = cset->ac->clabel;
3343
3344 if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3345 ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3346 ((clabel->parityConfig == '5') && (num_missing > 1))) {
3347 /* XXX this needs to be made *much* more general */
3348 /* Too many failures */
3349 return(0);
3350 }
3351 /* otherwise, all is well, and we've got enough to take a kick
3352 at autoconfiguring this set */
3353 return(1);
3354 }
3355
3356 void
3357 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3358 RF_Raid_t *raidPtr)
3359 {
3360 RF_ComponentLabel_t *clabel;
3361 int i;
3362
3363 clabel = ac->clabel;
3364
3365 /* 1. Fill in the common stuff */
3366 config->numRow = clabel->num_rows = 1;
3367 config->numCol = clabel->num_columns;
3368 config->numSpare = 0; /* XXX should this be set here? */
3369 config->sectPerSU = clabel->sectPerSU;
3370 config->SUsPerPU = clabel->SUsPerPU;
3371 config->SUsPerRU = clabel->SUsPerRU;
3372 config->parityConfig = clabel->parityConfig;
3373 /* XXX... */
3374 strcpy(config->diskQueueType,"fifo");
3375 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3376 config->layoutSpecificSize = 0; /* XXX ?? */
3377
3378 while(ac!=NULL) {
3379 /* row/col values will be in range due to the checks
3380 in reasonable_label() */
3381 strcpy(config->devnames[0][ac->clabel->column],
3382 ac->devname);
3383 ac = ac->next;
3384 }
3385
3386 for(i=0;i<RF_MAXDBGV;i++) {
3387 config->debugVars[i][0] = 0;
3388 }
3389 }
3390
3391 int
3392 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3393 {
3394 RF_ComponentLabel_t *clabel;
3395 int column;
3396 int sparecol;
3397
3398 raidPtr->autoconfigure = new_value;
3399
3400 for(column=0; column<raidPtr->numCol; column++) {
3401 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3402 clabel = raidget_component_label(raidPtr, column);
3403 clabel->autoconfigure = new_value;
3404 raidflush_component_label(raidPtr, column);
3405 }
3406 }
3407 for(column = 0; column < raidPtr->numSpare ; column++) {
3408 sparecol = raidPtr->numCol + column;
3409 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3410 clabel = raidget_component_label(raidPtr, sparecol);
3411 clabel->autoconfigure = new_value;
3412 raidflush_component_label(raidPtr, sparecol);
3413 }
3414 }
3415 return(new_value);
3416 }
3417
3418 int
3419 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3420 {
3421 RF_ComponentLabel_t *clabel;
3422 int column;
3423 int sparecol;
3424
3425 raidPtr->root_partition = new_value;
3426 for(column=0; column<raidPtr->numCol; column++) {
3427 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3428 clabel = raidget_component_label(raidPtr, column);
3429 clabel->root_partition = new_value;
3430 raidflush_component_label(raidPtr, column);
3431 }
3432 }
3433 for(column = 0; column < raidPtr->numSpare ; column++) {
3434 sparecol = raidPtr->numCol + column;
3435 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3436 clabel = raidget_component_label(raidPtr, sparecol);
3437 clabel->root_partition = new_value;
3438 raidflush_component_label(raidPtr, sparecol);
3439 }
3440 }
3441 return(new_value);
3442 }
3443
3444 void
3445 rf_release_all_vps(RF_ConfigSet_t *cset)
3446 {
3447 RF_AutoConfig_t *ac;
3448
3449 ac = cset->ac;
3450 while(ac!=NULL) {
3451 /* Close the vp, and give it back */
3452 if (ac->vp) {
3453 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3454 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3455 vput(ac->vp);
3456 ac->vp = NULL;
3457 }
3458 ac = ac->next;
3459 }
3460 }
3461
3462
3463 void
3464 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3465 {
3466 RF_AutoConfig_t *ac;
3467 RF_AutoConfig_t *next_ac;
3468
3469 ac = cset->ac;
3470 while(ac!=NULL) {
3471 next_ac = ac->next;
3472 /* nuke the label */
3473 free(ac->clabel, M_RAIDFRAME);
3474 /* cleanup the config structure */
3475 free(ac, M_RAIDFRAME);
3476 /* "next.." */
3477 ac = next_ac;
3478 }
3479 /* and, finally, nuke the config set */
3480 free(cset, M_RAIDFRAME);
3481 }
3482
3483
3484 void
3485 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3486 {
3487 /* current version number */
3488 clabel->version = RF_COMPONENT_LABEL_VERSION;
3489 clabel->serial_number = raidPtr->serial_number;
3490 clabel->mod_counter = raidPtr->mod_counter;
3491
3492 clabel->num_rows = 1;
3493 clabel->num_columns = raidPtr->numCol;
3494 clabel->clean = RF_RAID_DIRTY; /* not clean */
3495 clabel->status = rf_ds_optimal; /* "It's good!" */
3496
3497 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3498 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3499 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3500
3501 clabel->blockSize = raidPtr->bytesPerSector;
3502 clabel->numBlocks = raidPtr->sectorsPerDisk;
3503
3504 /* XXX not portable */
3505 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3506 clabel->maxOutstanding = raidPtr->maxOutstanding;
3507 clabel->autoconfigure = raidPtr->autoconfigure;
3508 clabel->root_partition = raidPtr->root_partition;
3509 clabel->last_unit = raidPtr->raidid;
3510 clabel->config_order = raidPtr->config_order;
3511
3512 #ifndef RF_NO_PARITY_MAP
3513 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3514 #endif
3515 }
3516
3517 int
3518 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
3519 {
3520 RF_Raid_t *raidPtr;
3521 RF_Config_t *config;
3522 int raidID;
3523 int retcode;
3524
3525 #ifdef DEBUG
3526 printf("RAID autoconfigure\n");
3527 #endif
3528
3529 retcode = 0;
3530 *unit = -1;
3531
3532 /* 1. Create a config structure */
3533
3534 config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
3535 M_RAIDFRAME,
3536 M_NOWAIT);
3537 if (config==NULL) {
3538 printf("Out of mem!?!?\n");
3539 /* XXX do something more intelligent here. */
3540 return(1);
3541 }
3542
3543 memset(config, 0, sizeof(RF_Config_t));
3544
3545 /*
3546 2. Figure out what RAID ID this one is supposed to live at
3547 See if we can get the same RAID dev that it was configured
3548 on last time..
3549 */
3550
3551 raidID = cset->ac->clabel->last_unit;
3552 if ((raidID < 0) || (raidID >= numraid)) {
3553 /* let's not wander off into lala land. */
3554 raidID = numraid - 1;
3555 }
3556 if (raidPtrs[raidID]->valid != 0) {
3557
3558 /*
3559 Nope... Go looking for an alternative...
3560 Start high so we don't immediately use raid0 if that's
3561 not taken.
3562 */
3563
3564 for(raidID = numraid - 1; raidID >= 0; raidID--) {
3565 if (raidPtrs[raidID]->valid == 0) {
3566 /* can use this one! */
3567 break;
3568 }
3569 }
3570 }
3571
3572 if (raidID < 0) {
3573 /* punt... */
3574 printf("Unable to auto configure this set!\n");
3575 printf("(Out of RAID devs!)\n");
3576 free(config, M_RAIDFRAME);
3577 return(1);
3578 }
3579
3580 #ifdef DEBUG
3581 printf("Configuring raid%d:\n",raidID);
3582 #endif
3583
3584 raidPtr = raidPtrs[raidID];
3585
3586 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3587 raidPtr->raidid = raidID;
3588 raidPtr->openings = RAIDOUTSTANDING;
3589
3590 /* 3. Build the configuration structure */
3591 rf_create_configuration(cset->ac, config, raidPtr);
3592
3593 /* 4. Do the configuration */
3594 retcode = rf_Configure(raidPtr, config, cset->ac);
3595
3596 if (retcode == 0) {
3597
3598 raidinit(raidPtrs[raidID]);
3599
3600 rf_markalldirty(raidPtrs[raidID]);
3601 raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
3602 if (cset->ac->clabel->root_partition==1) {
3603 /* everything configured just fine. Make a note
3604 that this set is eligible to be root. */
3605 cset->rootable = 1;
3606 /* XXX do this here? */
3607 raidPtrs[raidID]->root_partition = 1;
3608 }
3609 }
3610
3611 /* 5. Cleanup */
3612 free(config, M_RAIDFRAME);
3613
3614 *unit = raidID;
3615 return(retcode);
3616 }
3617
3618 void
3619 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3620 {
3621 struct buf *bp;
3622
3623 bp = (struct buf *)desc->bp;
3624 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3625 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3626 }
3627
3628 void
3629 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
3630 size_t xmin, size_t xmax)
3631 {
3632 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3633 pool_sethiwat(p, xmax);
3634 pool_prime(p, xmin);
3635 pool_setlowat(p, xmin);
3636 }
3637
3638 /*
3639 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3640 * if there is IO pending and if that IO could possibly be done for a
3641 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3642 * otherwise.
3643 *
3644 */
3645
3646 int
3647 rf_buf_queue_check(int raidid)
3648 {
3649 if ((BUFQ_PEEK(raid_softc[raidid].buf_queue) != NULL) &&
3650 raidPtrs[raidid]->openings > 0) {
3651 /* there is work to do */
3652 return 0;
3653 }
3654 /* default is nothing to do */
3655 return 1;
3656 }
3657
3658 int
3659 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
3660 {
3661 struct partinfo dpart;
3662 struct dkwedge_info dkw;
3663 int error;
3664
3665 error = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, l->l_cred);
3666 if (error == 0) {
3667 diskPtr->blockSize = dpart.disklab->d_secsize;
3668 diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
3669 diskPtr->partitionSize = dpart.part->p_size;
3670 return 0;
3671 }
3672
3673 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, l->l_cred);
3674 if (error == 0) {
3675 diskPtr->blockSize = 512; /* XXX */
3676 diskPtr->numBlocks = dkw.dkw_size - rf_protectedSectors;
3677 diskPtr->partitionSize = dkw.dkw_size;
3678 return 0;
3679 }
3680 return error;
3681 }
3682
/*
 * raid_match(self, cfdata, aux) -- match routine for the raid
 * device.  It ignores its arguments and always returns 1.
 */
static int
raid_match(struct device *self, struct cfdata *cfdata, void *aux)
{

	return 1;
}
3689
/*
 * raid_attach(parent, self, aux) -- attach routine for the raid
 * device.  Intentionally empty: it does no work here.
 */
static void
raid_attach(struct device *parent, struct device *self, void *aux)
{

	/* Nothing to do. */
}
3696
3697
3698 static int
3699 raid_detach(struct device *self, int flags)
3700 {
3701 struct raid_softc *rs = (struct raid_softc *)self;
3702
3703 if (rs->sc_flags & RAIDF_INITED)
3704 return EBUSY;
3705
3706 return 0;
3707 }
3708
3709 static void
3710 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
3711 {
3712 prop_dictionary_t disk_info, odisk_info, geom;
3713 disk_info = prop_dictionary_create();
3714 geom = prop_dictionary_create();
3715 prop_dictionary_set_uint64(geom, "sectors-per-unit",
3716 raidPtr->totalSectors);
3717 prop_dictionary_set_uint32(geom, "sector-size",
3718 raidPtr->bytesPerSector);
3719
3720 prop_dictionary_set_uint16(geom, "sectors-per-track",
3721 raidPtr->Layout.dataSectorsPerStripe);
3722 prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
3723 4 * raidPtr->numCol);
3724
3725 prop_dictionary_set_uint64(geom, "cylinders-per-unit",
3726 raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
3727 (4 * raidPtr->numCol)));
3728
3729 prop_dictionary_set(disk_info, "geometry", geom);
3730 prop_object_release(geom);
3731 prop_dictionary_set(device_properties(rs->sc_dev),
3732 "disk-info", disk_info);
3733 odisk_info = rs->sc_dkdev.dk_info;
3734 rs->sc_dkdev.dk_info = disk_info;
3735 if (odisk_info)
3736 prop_object_release(odisk_info);
3737 }
3738
3739 /*
3740 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3741 * We end up returning whatever error was returned by the first cache flush
3742 * that fails.
3743 */
3744
3745 int
3746 rf_sync_component_caches(RF_Raid_t *raidPtr)
3747 {
3748 int c, sparecol;
3749 int e,error;
3750 int force = 1;
3751
3752 error = 0;
3753 for (c = 0; c < raidPtr->numCol; c++) {
3754 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3755 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3756 &force, FWRITE, NOCRED);
3757 if (e) {
3758 if (e != ENODEV)
3759 printf("raid%d: cache flush to component %s failed.\n",
3760 raidPtr->raidid, raidPtr->Disks[c].devname);
3761 if (error == 0) {
3762 error = e;
3763 }
3764 }
3765 }
3766 }
3767
3768 for( c = 0; c < raidPtr->numSpare ; c++) {
3769 sparecol = raidPtr->numCol + c;
3770 /* Need to ensure that the reconstruct actually completed! */
3771 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3772 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3773 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3774 if (e) {
3775 if (e != ENODEV)
3776 printf("raid%d: cache flush to component %s failed.\n",
3777 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3778 if (error == 0) {
3779 error = e;
3780 }
3781 }
3782 }
3783 }
3784 return error;
3785 }
3786