1 /* $NetBSD: rf_netbsdkintf.c,v 1.248 2008/06/16 16:58:26 oster Exp $ */
2 /*-
3 * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Greg Oster; Jason R. Thorpe.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * Copyright (c) 1990, 1993
33 * The Regents of the University of California. All rights reserved.
34 *
35 * This code is derived from software contributed to Berkeley by
36 * the Systems Programming Group of the University of Utah Computer
37 * Science Department.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 * may be used to endorse or promote products derived from this software
49 * without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 *
63 * from: Utah $Hdr: cd.c 1.6 90/11/28$
64 *
65 * @(#)cd.c 8.2 (Berkeley) 11/16/93
66 */
67
68 /*
69 * Copyright (c) 1988 University of Utah.
70 *
71 * This code is derived from software contributed to Berkeley by
72 * the Systems Programming Group of the University of Utah Computer
73 * Science Department.
74 *
75 * Redistribution and use in source and binary forms, with or without
76 * modification, are permitted provided that the following conditions
77 * are met:
78 * 1. Redistributions of source code must retain the above copyright
79 * notice, this list of conditions and the following disclaimer.
80 * 2. Redistributions in binary form must reproduce the above copyright
81 * notice, this list of conditions and the following disclaimer in the
82 * documentation and/or other materials provided with the distribution.
83 * 3. All advertising materials mentioning features or use of this software
84 * must display the following acknowledgement:
85 * This product includes software developed by the University of
86 * California, Berkeley and its contributors.
87 * 4. Neither the name of the University nor the names of its contributors
88 * may be used to endorse or promote products derived from this software
89 * without specific prior written permission.
90 *
91 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
92 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
93 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
94 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
95 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
96 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
97 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
98 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
99 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
100 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
101 * SUCH DAMAGE.
102 *
103 * from: Utah $Hdr: cd.c 1.6 90/11/28$
104 *
105 * @(#)cd.c 8.2 (Berkeley) 11/16/93
106 */
107
108 /*
109 * Copyright (c) 1995 Carnegie-Mellon University.
110 * All rights reserved.
111 *
112 * Authors: Mark Holland, Jim Zelenka
113 *
114 * Permission to use, copy, modify and distribute this software and
115 * its documentation is hereby granted, provided that both the copyright
116 * notice and this permission notice appear in all copies of the
117 * software, derivative works or modified versions, and any portions
118 * thereof, and that both notices appear in supporting documentation.
119 *
120 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
121 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
122 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
123 *
124 * Carnegie Mellon requests users of this software to return to
125 *
126 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
127 * School of Computer Science
128 * Carnegie Mellon University
129 * Pittsburgh PA 15213-3890
130 *
131 * any improvements or extensions that they make and grant Carnegie the
132 * rights to redistribute these changes.
133 */
134
135 /***********************************************************
136 *
137 * rf_kintf.c -- the kernel interface routines for RAIDframe
138 *
139 ***********************************************************/
140
141 #include <sys/cdefs.h>
142 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.248 2008/06/16 16:58:26 oster Exp $");
143
144 #include <sys/param.h>
145 #include <sys/errno.h>
146 #include <sys/pool.h>
147 #include <sys/proc.h>
148 #include <sys/queue.h>
149 #include <sys/disk.h>
150 #include <sys/device.h>
151 #include <sys/stat.h>
152 #include <sys/ioctl.h>
153 #include <sys/fcntl.h>
154 #include <sys/systm.h>
155 #include <sys/vnode.h>
156 #include <sys/disklabel.h>
157 #include <sys/conf.h>
158 #include <sys/buf.h>
159 #include <sys/bufq.h>
160 #include <sys/user.h>
161 #include <sys/reboot.h>
162 #include <sys/kauth.h>
163
164 #include <prop/proplib.h>
165
166 #include <dev/raidframe/raidframevar.h>
167 #include <dev/raidframe/raidframeio.h>
168 #include "raid.h"
169 #include "opt_raid_autoconfig.h"
170 #include "rf_raid.h"
171 #include "rf_copyback.h"
172 #include "rf_dag.h"
173 #include "rf_dagflags.h"
174 #include "rf_desc.h"
175 #include "rf_diskqueue.h"
176 #include "rf_etimer.h"
177 #include "rf_general.h"
178 #include "rf_kintf.h"
179 #include "rf_options.h"
180 #include "rf_driver.h"
181 #include "rf_parityscan.h"
182 #include "rf_threadstuff.h"
183
184 #ifdef DEBUG
185 int rf_kdebug_level = 0;
186 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
187 #else /* DEBUG */
188 #define db1_printf(a) { }
189 #endif /* DEBUG */
190
191 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
192
193 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
194
195 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
196 * spare table */
197 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
198 * installation process */
199
200 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
201
202 /* prototypes */
203 static void KernelWakeupFunc(struct buf *);
204 static void InitBP(struct buf *, struct vnode *, unsigned,
205 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
206 void *, int, struct proc *);
207 static void raidinit(RF_Raid_t *);
208
209 void raidattach(int);
210 static int raid_match(struct device *, struct cfdata *, void *);
211 static void raid_attach(struct device *, struct device *, void *);
212 static int raid_detach(struct device *, int);
213
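/*
 * Entry points for the raid block and character devices; these populate
 * the bdevsw/cdevsw tables below.
 */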
214 dev_type_open(raidopen);
215 dev_type_close(raidclose);
216 dev_type_read(raidread);
217 dev_type_write(raidwrite);
218 dev_type_ioctl(raidioctl);
219 dev_type_strategy(raidstrategy);
220 dev_type_dump(raiddump);
221 dev_type_size(raidsize);
222
223 const struct bdevsw raid_bdevsw = {
224 raidopen, raidclose, raidstrategy, raidioctl,
225 raiddump, raidsize, D_DISK
226 };
227
228 const struct cdevsw raid_cdevsw = {
229 raidopen, raidclose, raidread, raidwrite, raidioctl,
230 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
231 };
232
233 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
234
235 /* XXX Not sure if the following should be replacing the raidPtrs above,
236 or if it should be used in conjunction with that...
237 */
238
239 struct raid_softc {
240 struct device *sc_dev;
241 int sc_flags; /* flags */
242 int sc_cflags; /* configuration flags */
243 uint64_t sc_size; /* size of the raid device */
244 char sc_xname[20]; /* XXX external name */
245 struct disk sc_dkdev; /* generic disk device info */
246 struct bufq_state *buf_queue; /* used for the device queue */
247 };
248 /* sc_flags */
249 #define RAIDF_INITED 0x01 /* unit has been initialized */
250 #define RAIDF_WLABEL 0x02 /* label area is writable */
251 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
252 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
253 #define RAIDF_LOCKED 0x80 /* unit is locked */
254
255 #define raidunit(x) DISKUNIT(x)
256 int numraid = 0;
257
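/*
 * Autoconfiguration glue: raid units are attached as pseudo-devices via
 * config_attach_pseudo() in raidinit(), using the match/attach/detach
 * routines declared above.
 */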
258 extern struct cfdriver raid_cd;
259 CFATTACH_DECL_NEW(raid, sizeof(struct raid_softc),
260 raid_match, raid_attach, raid_detach, NULL);
261
262 /*
263 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
264 * Be aware that large numbers can allow the driver to consume a lot of
265 * kernel memory, especially on writes, and in degraded mode reads.
266 *
267 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
268 * a single 64K write will typically require 64K for the old data,
269 * 64K for the old parity, and 64K for the new parity, for a total
270 * of 192K (if the parity buffer is not re-used immediately).
271 * Even if it is used immediately, that's still 128K, which when multiplied
272 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
273 *
274 * Now in degraded mode, for example, a 64K read on the above setup may
275 * require data reconstruction, which will require *all* of the 4 remaining
276 * disks to participate -- 4 * 32K/disk == 128K again.
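 *
 * With the default of RAIDOUTSTANDING == 6, the 64K-write example above
 * comes to roughly 6 * 128K = 768K of parity/old-data buffers on top of
 * 6 * 64K = 384K of incoming data.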
277 */
278
279 #ifndef RAIDOUTSTANDING
280 #define RAIDOUTSTANDING 6
281 #endif
282
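/* The raw partition of a given raid unit, i.e. where its disklabel lives. */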
283 #define RAIDLABELDEV(dev) \
284 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
285
286 /* declared here, and made public, for the benefit of KVM stuff.. */
287 struct raid_softc *raid_softc;
288
289 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
290 struct disklabel *);
291 static void raidgetdisklabel(dev_t);
292 static void raidmakedisklabel(struct raid_softc *);
293
294 static int raidlock(struct raid_softc *);
295 static void raidunlock(struct raid_softc *);
296
297 static void rf_markalldirty(RF_Raid_t *);
298 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
299
300 void rf_ReconThread(struct rf_recon_req *);
301 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
302 void rf_CopybackThread(RF_Raid_t *raidPtr);
303 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
304 int rf_autoconfig(struct device *self);
305 void rf_buildroothack(RF_ConfigSet_t *);
306
307 RF_AutoConfig_t *rf_find_raid_components(void);
308 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
309 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
310 static int rf_reasonable_label(RF_ComponentLabel_t *);
311 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
312 int rf_set_autoconfig(RF_Raid_t *, int);
313 int rf_set_rootpartition(RF_Raid_t *, int);
314 void rf_release_all_vps(RF_ConfigSet_t *);
315 void rf_cleanup_config_set(RF_ConfigSet_t *);
316 int rf_have_enough_components(RF_ConfigSet_t *);
317 int rf_auto_config_set(RF_ConfigSet_t *, int *);
318
319 static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not
320 allow autoconfig to take place.
321 Note that this is overridden by having
322 RAID_AUTOCONFIG as an option in the
323 kernel config file. */
324
325 struct RF_Pools_s rf_pools;
326
327 void
328 raidattach(int num)
329 {
330 int raidID;
331 int i, rc;
332
333 #ifdef DEBUG
334 printf("raidattach: Asked for %d units\n", num);
335 #endif
336
337 if (num <= 0) {
338 #ifdef DIAGNOSTIC
339 panic("raidattach: count <= 0");
340 #endif
341 return;
342 }
343 /* This is where all the initialization stuff gets done. */
344
345 numraid = num;
346
347 /* Make some space for requested number of units... */
348
349 RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
350 if (raidPtrs == NULL) {
351 panic("raidPtrs is NULL!!");
352 }
353
354 rf_mutex_init(&rf_sparet_wait_mutex);
355
356 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
357
358 for (i = 0; i < num; i++)
359 raidPtrs[i] = NULL;
360 rc = rf_BootRaidframe();
361 if (rc == 0)
362 aprint_normal("Kernelized RAIDframe activated\n");
363 else
364 panic("Serious error booting RAID!!");
365
366 /* put together some data structures like the CCD device does.. This
367 * lets us lock the device and what-not when it gets opened. */
368
369 raid_softc = (struct raid_softc *)
370 malloc(num * sizeof(struct raid_softc),
371 M_RAIDFRAME, M_NOWAIT);
372 if (raid_softc == NULL) {
373 aprint_error("WARNING: no memory for RAIDframe driver\n");
374 return;
375 }
376
377 memset(raid_softc, 0, num * sizeof(struct raid_softc));
378
379 for (raidID = 0; raidID < num; raidID++) {
380 bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);
381
382 RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
383 (RF_Raid_t *));
384 if (raidPtrs[raidID] == NULL) {
385 aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
386 numraid = raidID;
387 return;
388 }
389 }
390
391 if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
392 aprint_error("raidattach: config_cfattach_attach failed?\n");
393 }
394
395 #ifdef RAID_AUTOCONFIG
396 raidautoconfig = 1;
397 #endif
398
399 /*
400 * Register a finalizer which will be used to auto-config RAID
401 * sets once all real hardware devices have been found.
402 */
403 if (config_finalize_register(NULL, rf_autoconfig) != 0)
404 aprint_error("WARNING: unable to register RAIDframe finalizer\n");
405 }
406
407 int
408 rf_autoconfig(struct device *self)
409 {
410 RF_AutoConfig_t *ac_list;
411 RF_ConfigSet_t *config_sets;
412
413 if (raidautoconfig == 0)
414 return (0);
415
416 /* XXX This code can only be run once. */
417 raidautoconfig = 0;
418
419 /* 1. locate all RAID components on the system */
420 #ifdef DEBUG
421 printf("Searching for RAID components...\n");
422 #endif
423 ac_list = rf_find_raid_components();
424
425 /* 2. Sort them into their respective sets. */
426 config_sets = rf_create_auto_sets(ac_list);
427
428 /*
429 * 3. Evaluate each set and configure the valid ones.
430 * This gets done in rf_buildroothack().
431 */
432 rf_buildroothack(config_sets);
433
434 return 1;
435 }
436
437 void
438 rf_buildroothack(RF_ConfigSet_t *config_sets)
439 {
440 RF_ConfigSet_t *cset;
441 RF_ConfigSet_t *next_cset;
442 int retcode;
443 int raidID;
444 int rootID;
445 int col;
446 int num_root;
447 char *devname;
448
449 rootID = 0;
450 num_root = 0;
451 cset = config_sets;
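	/*
	 * Walk each configuration set: auto-configure the complete sets
	 * that request it, and remember any set that claims to contain
	 * the root file system.
	 */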
452 while(cset != NULL ) {
453 next_cset = cset->next;
454 if (rf_have_enough_components(cset) &&
455 cset->ac->clabel->autoconfigure==1) {
456 retcode = rf_auto_config_set(cset,&raidID);
457 if (!retcode) {
458 #ifdef DEBUG
459 printf("raid%d: configured ok\n", raidID);
460 #endif
461 if (cset->rootable) {
462 rootID = raidID;
463 num_root++;
464 }
465 } else {
466 /* The autoconfig didn't work :( */
467 #ifdef DEBUG
468 printf("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
469 #endif
470 rf_release_all_vps(cset);
471 }
472 } else {
473 /* we're not autoconfiguring this set...
474 release the associated resources */
475 rf_release_all_vps(cset);
476 }
477 /* cleanup */
478 rf_cleanup_config_set(cset);
479 cset = next_cset;
480 }
481
482 /* if the user has specified what the root device should be
483 then we don't touch booted_device or boothowto... */
484
485 if (rootspec != NULL)
486 return;
487
488 /* we found something bootable... */
489
490 if (num_root == 1) {
491 booted_device = raid_softc[rootID].sc_dev;
492 } else if (num_root > 1) {
493
494 /*
495 * Maybe the MD code can help. If it cannot, then
496 * setroot() will discover that we have no
497 * booted_device and will ask the user if nothing was
498 * hardwired in the kernel config file
499 */
500
501 if (booted_device == NULL)
502 cpu_rootconf();
503 if (booted_device == NULL)
504 return;
505
506 num_root = 0;
507 for (raidID = 0; raidID < numraid; raidID++) {
508 if (raidPtrs[raidID]->valid == 0)
509 continue;
510
511 if (raidPtrs[raidID]->root_partition == 0)
512 continue;
513
514 for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
515 devname = raidPtrs[raidID]->Disks[col].devname;
516 devname += sizeof("/dev/") - 1;
517 if (strncmp(devname, device_xname(booted_device),
518 strlen(device_xname(booted_device))) != 0)
519 continue;
520 #ifdef DEBUG
521 printf("raid%d includes boot device %s\n",
522 raidID, devname);
523 #endif
524 num_root++;
525 rootID = raidID;
526 }
527 }
528
529 if (num_root == 1) {
530 booted_device = raid_softc[rootID].sc_dev;
531 } else {
532 /* we can't guess.. require the user to answer... */
533 boothowto |= RB_ASKNAME;
534 }
535 }
536 }
537
538
539 int
540 raidsize(dev_t dev)
541 {
542 struct raid_softc *rs;
543 struct disklabel *lp;
544 int part, unit, omask, size;
545
546 unit = raidunit(dev);
547 if (unit >= numraid)
548 return (-1);
549 rs = &raid_softc[unit];
550
551 if ((rs->sc_flags & RAIDF_INITED) == 0)
552 return (-1);
553
554 part = DISKPART(dev);
555 omask = rs->sc_dkdev.dk_openmask & (1 << part);
556 lp = rs->sc_dkdev.dk_label;
557
558 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
559 return (-1);
560
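	/*
	 * raidsize() backs the d_psize entry point, which is only used
	 * for swap and crash dumps, so only swap partitions report a size.
	 */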
561 if (lp->d_partitions[part].p_fstype != FS_SWAP)
562 size = -1;
563 else
564 size = lp->d_partitions[part].p_size *
565 (lp->d_secsize / DEV_BSIZE);
566
567 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
568 return (-1);
569
570 return (size);
571
572 }
573
574 int
575 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
576 {
577 int unit = raidunit(dev);
578 struct raid_softc *rs;
579 const struct bdevsw *bdev;
580 struct disklabel *lp;
581 RF_Raid_t *raidPtr;
582 daddr_t offset;
583 int part, c, sparecol, j, scol, dumpto;
584 int error = 0;
585
586 if (unit >= numraid)
587 return (ENXIO);
588
589 rs = &raid_softc[unit];
590 raidPtr = raidPtrs[unit];
591
592 if ((rs->sc_flags & RAIDF_INITED) == 0)
593 return ENXIO;
594
595 /* we only support dumping to RAID 1 sets */
596 if (raidPtr->Layout.numDataCol != 1 ||
597 raidPtr->Layout.numParityCol != 1)
598 return EINVAL;
599
600
601 if ((error = raidlock(rs)) != 0)
602 return error;
603
604 if (size % DEV_BSIZE != 0) {
605 error = EINVAL;
606 goto out;
607 }
608
609 if (blkno + size / DEV_BSIZE > rs->sc_size) {
610 printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
611 "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
612 size / DEV_BSIZE, rs->sc_size);
613 error = EINVAL;
614 goto out;
615 }
616
617 part = DISKPART(dev);
618 lp = rs->sc_dkdev.dk_label;
619 offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
620
621 /* figure out what device is alive.. */
622
623 /*
624 Look for a component to dump to. The preference for the
625 component to dump to is as follows:
626 1) the master
627 2) a used_spare of the master
628 3) the slave
629 4) a used_spare of the slave
630 */
631
632 dumpto = -1;
633 for (c = 0; c < raidPtr->numCol; c++) {
634 if (raidPtr->Disks[c].status == rf_ds_optimal) {
635 /* this might be the one */
636 dumpto = c;
637 break;
638 }
639 }
640
641 /*
642 At this point we have possibly selected a live master or a
643 live slave. We now check to see if there is a spared
644 master (or a spared slave), if we didn't find a live master
645 or a live slave.
646 */
647
648 for (c = 0; c < raidPtr->numSpare; c++) {
649 sparecol = raidPtr->numCol + c;
650 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
651 /* How about this one? */
652 scol = -1;
653 for(j=0;j<raidPtr->numCol;j++) {
654 if (raidPtr->Disks[j].spareCol == sparecol) {
655 scol = j;
656 break;
657 }
658 }
659 if (scol == 0) {
660 /*
661 We must have found a spared master!
662 We'll take that over anything else
663 found so far. (We couldn't have
664 found a real master before, since
665 this is a used spare, and it's
666 saying that it's replacing the
667 master.) On reboot (with
668 autoconfiguration turned on)
669 sparecol will become the 1st
670 component (component0) of this set.
671 */
672 dumpto = sparecol;
673 break;
674 } else if (scol != -1) {
675 /*
676 Must be a spared slave. We'll dump
677 to that if we haven't found anything
678 else so far.
679 */
680 if (dumpto == -1)
681 dumpto = sparecol;
682 }
683 }
684 }
685
686 if (dumpto == -1) {
687 /* we couldn't find any live components to dump to!?!?
688 */
689 error = EINVAL;
690 goto out;
691 }
692
693 bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
694
695 /*
696 Note that blkno is relative to this particular partition.
697 By adding the offset of this partition in the RAID
698 set, and also adding RF_PROTECTED_SECTORS, we get a
699 value that is relative to the partition used for the
700 underlying component.
701 */
702
703 error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
704 blkno + offset, va, size);
705
706 out:
707 raidunlock(rs);
708
709 return error;
710 }
711 /* ARGSUSED */
712 int
713 raidopen(dev_t dev, int flags, int fmt,
714 struct lwp *l)
715 {
716 int unit = raidunit(dev);
717 struct raid_softc *rs;
718 struct disklabel *lp;
719 int part, pmask;
720 int error = 0;
721
722 if (unit >= numraid)
723 return (ENXIO);
724 rs = &raid_softc[unit];
725
726 if ((error = raidlock(rs)) != 0)
727 return (error);
728 lp = rs->sc_dkdev.dk_label;
729
730 part = DISKPART(dev);
731
732 /*
733 * If there are wedges, and this is not RAW_PART, then we
734 * need to fail.
735 */
736 if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
737 error = EBUSY;
738 goto bad;
739 }
740 pmask = (1 << part);
741
742 if ((rs->sc_flags & RAIDF_INITED) &&
743 (rs->sc_dkdev.dk_openmask == 0))
744 raidgetdisklabel(dev);
745
746 /* make sure that this partition exists */
747
748 if (part != RAW_PART) {
749 if (((rs->sc_flags & RAIDF_INITED) == 0) ||
750 ((part >= lp->d_npartitions) ||
751 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
752 error = ENXIO;
753 goto bad;
754 }
755 }
756 /* Prevent this unit from being unconfigured while open. */
757 switch (fmt) {
758 case S_IFCHR:
759 rs->sc_dkdev.dk_copenmask |= pmask;
760 break;
761
762 case S_IFBLK:
763 rs->sc_dkdev.dk_bopenmask |= pmask;
764 break;
765 }
766
767 if ((rs->sc_dkdev.dk_openmask == 0) &&
768 ((rs->sc_flags & RAIDF_INITED) != 0)) {
769 /* First one... mark things as dirty... Note that we *MUST*
770 have done a configure before this. I DO NOT WANT TO BE
771 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
772 THAT THEY BELONG TOGETHER!!!!! */
773 /* XXX should check to see if we're only open for reading
774 here... If so, we needn't do this, but then need some
775 other way of keeping track of what's happened.. */
776
777 rf_markalldirty( raidPtrs[unit] );
778 }
779
780
781 rs->sc_dkdev.dk_openmask =
782 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
783
784 bad:
785 raidunlock(rs);
786
787 return (error);
788
789
790 }
791 /* ARGSUSED */
792 int
793 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
794 {
795 int unit = raidunit(dev);
796 struct cfdata *cf;
797 struct raid_softc *rs;
798 int error = 0;
799 int part;
800
801 if (unit >= numraid)
802 return (ENXIO);
803 rs = &raid_softc[unit];
804
805 if ((error = raidlock(rs)) != 0)
806 return (error);
807
808 part = DISKPART(dev);
809
810 /* ...that much closer to allowing unconfiguration... */
811 switch (fmt) {
812 case S_IFCHR:
813 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
814 break;
815
816 case S_IFBLK:
817 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
818 break;
819 }
820 rs->sc_dkdev.dk_openmask =
821 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
822
823 if ((rs->sc_dkdev.dk_openmask == 0) &&
824 ((rs->sc_flags & RAIDF_INITED) != 0)) {
825 /* Last one... device is not unconfigured yet.
826 Device shutdown has taken care of setting the
827 clean bits if RAIDF_INITED is not set;
828 otherwise, mark things as clean here... */
829
830 rf_update_component_labels(raidPtrs[unit],
831 RF_FINAL_COMPONENT_UPDATE);
832 if (doing_shutdown) {
833 /* last one, and we're going down, so
834 lights out for this RAID set too. */
835 error = rf_Shutdown(raidPtrs[unit]);
836
837 /* It's no longer initialized... */
838 rs->sc_flags &= ~RAIDF_INITED;
839
840 /* detach the device */
841
842 cf = device_cfdata(rs->sc_dev);
843 error = config_detach(rs->sc_dev, DETACH_QUIET);
844 free(cf, M_RAIDFRAME);
845
846 /* Detach the disk. */
847 disk_detach(&rs->sc_dkdev);
848 disk_destroy(&rs->sc_dkdev);
849 }
850 }
851
852 raidunlock(rs);
853 return (0);
854
855 }
856
857 void
858 raidstrategy(struct buf *bp)
859 {
860 int s;
861
862 unsigned int raidID = raidunit(bp->b_dev);
863 RF_Raid_t *raidPtr;
864 struct raid_softc *rs = &raid_softc[raidID];
865 int wlabel;
866
867 if (raidID >= numraid || !raidPtrs[raidID]) {
868 bp->b_error = ENODEV;
869 goto done;
870 }
871 if ((rs->sc_flags & RAIDF_INITED) == 0) {
872 bp->b_error = ENXIO;
873 goto done;
874 }
875 raidPtr = raidPtrs[raidID];
876 if (!raidPtr->valid) {
877 bp->b_error = ENODEV;
878 goto done;
879 }
880 if (bp->b_bcount == 0) {
881 db1_printf(("b_bcount is zero..\n"));
882 goto done;
883 }
884
885 /*
886 * Do bounds checking and adjust transfer. If there's an
887 * error, the bounds check will flag that for us.
888 */
889
890 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
891 if (DISKPART(bp->b_dev) == RAW_PART) {
892 uint64_t size; /* device size in DEV_BSIZE unit */
893
894 if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
895 size = raidPtr->totalSectors <<
896 (raidPtr->logBytesPerSector - DEV_BSHIFT);
897 } else {
898 size = raidPtr->totalSectors >>
899 (DEV_BSHIFT - raidPtr->logBytesPerSector);
900 }
901 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
902 goto done;
903 }
904 } else {
905 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
906 db1_printf(("Bounds check failed!!:%d %d\n",
907 (int) bp->b_blkno, (int) wlabel));
908 goto done;
909 }
910 }
911 s = splbio();
912
913 bp->b_resid = 0;
914
915 /* stuff it onto our queue */
916 BUFQ_PUT(rs->buf_queue, bp);
917
918 /* schedule the IO to happen at the next convenient time */
919 wakeup(&(raidPtrs[raidID]->iodone));
920
921 splx(s);
922 return;
923
924 done:
925 bp->b_resid = bp->b_bcount;
926 biodone(bp);
927 }
928 /* ARGSUSED */
929 int
930 raidread(dev_t dev, struct uio *uio, int flags)
931 {
932 int unit = raidunit(dev);
933 struct raid_softc *rs;
934
935 if (unit >= numraid)
936 return (ENXIO);
937 rs = &raid_softc[unit];
938
939 if ((rs->sc_flags & RAIDF_INITED) == 0)
940 return (ENXIO);
941
942 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
943
944 }
945 /* ARGSUSED */
946 int
947 raidwrite(dev_t dev, struct uio *uio, int flags)
948 {
949 int unit = raidunit(dev);
950 struct raid_softc *rs;
951
952 if (unit >= numraid)
953 return (ENXIO);
954 rs = &raid_softc[unit];
955
956 if ((rs->sc_flags & RAIDF_INITED) == 0)
957 return (ENXIO);
958
959 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
960
961 }
962
963 int
964 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
965 {
966 int unit = raidunit(dev);
967 int error = 0;
968 int part, pmask;
969 struct cfdata *cf;
970 struct raid_softc *rs;
971 RF_Config_t *k_cfg, *u_cfg;
972 RF_Raid_t *raidPtr;
973 RF_RaidDisk_t *diskPtr;
974 RF_AccTotals_t *totals;
975 RF_DeviceConfig_t *d_cfg, **ucfgp;
976 u_char *specific_buf;
977 int retcode = 0;
978 int column;
979 int raidid;
980 struct rf_recon_req *rrcopy, *rr;
981 RF_ComponentLabel_t *clabel;
982 RF_ComponentLabel_t *ci_label;
983 RF_ComponentLabel_t **clabel_ptr;
984 RF_SingleComponent_t *sparePtr,*componentPtr;
985 RF_SingleComponent_t component;
986 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
987 int i, j, d;
988 #ifdef __HAVE_OLD_DISKLABEL
989 struct disklabel newlabel;
990 #endif
991 struct dkwedge_info *dkw;
992
993 if (unit >= numraid)
994 return (ENXIO);
995 rs = &raid_softc[unit];
996 raidPtr = raidPtrs[unit];
997
998 db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
999 (int) DISKPART(dev), (int) unit, (int) cmd));
1000
1001 /* Must be open for writes for these commands... */
1002 switch (cmd) {
1003 #ifdef DIOCGSECTORSIZE
1004 case DIOCGSECTORSIZE:
1005 *(u_int *)data = raidPtr->bytesPerSector;
1006 return 0;
1007 case DIOCGMEDIASIZE:
1008 *(off_t *)data =
1009 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1010 return 0;
1011 #endif
1012 case DIOCSDINFO:
1013 case DIOCWDINFO:
1014 #ifdef __HAVE_OLD_DISKLABEL
1015 case ODIOCWDINFO:
1016 case ODIOCSDINFO:
1017 #endif
1018 case DIOCWLABEL:
1019 case DIOCAWEDGE:
1020 case DIOCDWEDGE:
1021 if ((flag & FWRITE) == 0)
1022 return (EBADF);
1023 }
1024
1025 /* Must be initialized for these... */
1026 switch (cmd) {
1027 case DIOCGDINFO:
1028 case DIOCSDINFO:
1029 case DIOCWDINFO:
1030 #ifdef __HAVE_OLD_DISKLABEL
1031 case ODIOCGDINFO:
1032 case ODIOCWDINFO:
1033 case ODIOCSDINFO:
1034 case ODIOCGDEFLABEL:
1035 #endif
1036 case DIOCGPART:
1037 case DIOCWLABEL:
1038 case DIOCGDEFLABEL:
1039 case DIOCAWEDGE:
1040 case DIOCDWEDGE:
1041 case DIOCLWEDGES:
1042 case RAIDFRAME_SHUTDOWN:
1043 case RAIDFRAME_REWRITEPARITY:
1044 case RAIDFRAME_GET_INFO:
1045 case RAIDFRAME_RESET_ACCTOTALS:
1046 case RAIDFRAME_GET_ACCTOTALS:
1047 case RAIDFRAME_KEEP_ACCTOTALS:
1048 case RAIDFRAME_GET_SIZE:
1049 case RAIDFRAME_FAIL_DISK:
1050 case RAIDFRAME_COPYBACK:
1051 case RAIDFRAME_CHECK_RECON_STATUS:
1052 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1053 case RAIDFRAME_GET_COMPONENT_LABEL:
1054 case RAIDFRAME_SET_COMPONENT_LABEL:
1055 case RAIDFRAME_ADD_HOT_SPARE:
1056 case RAIDFRAME_REMOVE_HOT_SPARE:
1057 case RAIDFRAME_INIT_LABELS:
1058 case RAIDFRAME_REBUILD_IN_PLACE:
1059 case RAIDFRAME_CHECK_PARITY:
1060 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1061 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1062 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1063 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1064 case RAIDFRAME_SET_AUTOCONFIG:
1065 case RAIDFRAME_SET_ROOT:
1066 case RAIDFRAME_DELETE_COMPONENT:
1067 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1068 if ((rs->sc_flags & RAIDF_INITED) == 0)
1069 return (ENXIO);
1070 }
1071
1072 switch (cmd) {
1073
1074 /* configure the system */
1075 case RAIDFRAME_CONFIGURE:
1076
1077 if (raidPtr->valid) {
1078 /* There is a valid RAID set running on this unit! */
1079 printf("raid%d: Device already configured!\n",unit);
1080 return(EINVAL);
1081 }
1082
1083 /* copy-in the configuration information */
1084 /* data points to a pointer to the configuration structure */
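		/*
		 * Userland (e.g. raidctl(8)) does roughly:
		 *
		 *	RF_Config_t cfg, *cfgp = &cfg;
		 *	... fill in cfg ...
		 *	ioctl(fd, RAIDFRAME_CONFIGURE, &cfgp);
		 *
		 * (an illustrative sketch only), hence the double
		 * indirection below.
		 */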
1085
1086 u_cfg = *((RF_Config_t **) data);
1087 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1088 if (k_cfg == NULL) {
1089 return (ENOMEM);
1090 }
1091 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1092 if (retcode) {
1093 RF_Free(k_cfg, sizeof(RF_Config_t));
1094 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1095 retcode));
1096 return (retcode);
1097 }
1098 /* allocate a buffer for the layout-specific data, and copy it
1099 * in */
1100 if (k_cfg->layoutSpecificSize) {
1101 if (k_cfg->layoutSpecificSize > 10000) {
1102 /* sanity check */
1103 RF_Free(k_cfg, sizeof(RF_Config_t));
1104 return (EINVAL);
1105 }
1106 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1107 (u_char *));
1108 if (specific_buf == NULL) {
1109 RF_Free(k_cfg, sizeof(RF_Config_t));
1110 return (ENOMEM);
1111 }
1112 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1113 k_cfg->layoutSpecificSize);
1114 if (retcode) {
1115 RF_Free(k_cfg, sizeof(RF_Config_t));
1116 RF_Free(specific_buf,
1117 k_cfg->layoutSpecificSize);
1118 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1119 retcode));
1120 return (retcode);
1121 }
1122 } else
1123 specific_buf = NULL;
1124 k_cfg->layoutSpecific = specific_buf;
1125
1126 /* should do some kind of sanity check on the configuration.
1127 * Store the sum of all the bytes in the last byte? */
1128
1129 /* configure the system */
1130
1131 /*
1132 * Clear the entire RAID descriptor, just to make sure
1133 * there is no stale data left in the case of a
1134 * reconfiguration
1135 */
1136 memset((char *) raidPtr, 0, sizeof(RF_Raid_t));
1137 raidPtr->raidid = unit;
1138
1139 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1140
1141 if (retcode == 0) {
1142
1143 /* allow this many simultaneous IO's to
1144 this RAID device */
1145 raidPtr->openings = RAIDOUTSTANDING;
1146
1147 raidinit(raidPtr);
1148 rf_markalldirty(raidPtr);
1149 }
1150 /* free the buffers. No return code here. */
1151 if (k_cfg->layoutSpecificSize) {
1152 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1153 }
1154 RF_Free(k_cfg, sizeof(RF_Config_t));
1155
1156 return (retcode);
1157
1158 /* shutdown the system */
1159 case RAIDFRAME_SHUTDOWN:
1160
1161 if ((error = raidlock(rs)) != 0)
1162 return (error);
1163
1164 /*
1165 * If somebody has a partition mounted, we shouldn't
1166 * shutdown.
1167 */
1168
1169 part = DISKPART(dev);
1170 pmask = (1 << part);
1171 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1172 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1173 (rs->sc_dkdev.dk_copenmask & pmask))) {
1174 raidunlock(rs);
1175 return (EBUSY);
1176 }
1177
1178 retcode = rf_Shutdown(raidPtr);
1179
1180 /* It's no longer initialized... */
1181 rs->sc_flags &= ~RAIDF_INITED;
1182
1183 /* free the pseudo device attach bits */
1184
1185 cf = device_cfdata(rs->sc_dev);
1186 /* XXX this causes us to not return any errors
1187 from the above call to rf_Shutdown() */
1188 retcode = config_detach(rs->sc_dev, DETACH_QUIET);
1189 free(cf, M_RAIDFRAME);
1190
1191 /* Detach the disk. */
1192 disk_detach(&rs->sc_dkdev);
1193 disk_destroy(&rs->sc_dkdev);
1194
1195 raidunlock(rs);
1196
1197 return (retcode);
1198 case RAIDFRAME_GET_COMPONENT_LABEL:
1199 clabel_ptr = (RF_ComponentLabel_t **) data;
1200 /* need to read the component label for the disk indicated
1201 by row,column in clabel */
1202
1203 /* For practice, let's get it directly from disk, rather
1204 than from the in-core copy */
1205 RF_Malloc( clabel, sizeof( RF_ComponentLabel_t ),
1206 (RF_ComponentLabel_t *));
1207 if (clabel == NULL)
1208 return (ENOMEM);
1209
1210 retcode = copyin( *clabel_ptr, clabel,
1211 sizeof(RF_ComponentLabel_t));
1212
1213 if (retcode) {
1214 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
1215 return(retcode);
1216 }
1217
1218 clabel->row = 0; /* Don't allow looking at anything else.*/
1219
1220 column = clabel->column;
1221
1222 if ((column < 0) || (column >= raidPtr->numCol +
1223 raidPtr->numSpare)) {
1224 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
1225 return(EINVAL);
1226 }
1227
1228 retcode = raidread_component_label(raidPtr->Disks[column].dev,
1229 raidPtr->raid_cinfo[column].ci_vp,
1230 clabel );
1231
1232 if (retcode == 0) {
1233 retcode = copyout(clabel, *clabel_ptr,
1234 sizeof(RF_ComponentLabel_t));
1235 }
1236 RF_Free(clabel, sizeof(RF_ComponentLabel_t));
1237 return (retcode);
1238
1239 case RAIDFRAME_SET_COMPONENT_LABEL:
1240 clabel = (RF_ComponentLabel_t *) data;
1241
1242 /* XXX check the label for valid stuff... */
1243 /* Note that some things *should not* get modified --
1244 the user should be re-initing the labels instead of
1245 trying to patch things.
1246 */
1247
1248 raidid = raidPtr->raidid;
1249 #ifdef DEBUG
1250 printf("raid%d: Got component label:\n", raidid);
1251 printf("raid%d: Version: %d\n", raidid, clabel->version);
1252 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1253 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1254 printf("raid%d: Column: %d\n", raidid, clabel->column);
1255 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1256 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1257 printf("raid%d: Status: %d\n", raidid, clabel->status);
1258 #endif
1259 clabel->row = 0;
1260 column = clabel->column;
1261
1262 if ((column < 0) || (column >= raidPtr->numCol)) {
1263 return(EINVAL);
1264 }
1265
1266 /* XXX this isn't allowed to do anything for now :-) */
1267
1268 /* XXX and before it is, we need to fill in the rest
1269 of the fields!?!?!?! */
1270 #if 0
1271 raidwrite_component_label(
1272 raidPtr->Disks[column].dev,
1273 raidPtr->raid_cinfo[column].ci_vp,
1274 clabel );
1275 #endif
1276 return (0);
1277
1278 case RAIDFRAME_INIT_LABELS:
1279 clabel = (RF_ComponentLabel_t *) data;
1280 /*
1281 we only want the serial number from
1282 the above. We get all the rest of the information
1283 from the config that was used to create this RAID
1284 set.
1285 */
1286
1287 raidPtr->serial_number = clabel->serial_number;
1288
1289 RF_Malloc(ci_label, sizeof(RF_ComponentLabel_t),
1290 (RF_ComponentLabel_t *));
1291 if (ci_label == NULL)
1292 return (ENOMEM);
1293
1294 raid_init_component_label(raidPtr, ci_label);
1295 ci_label->serial_number = clabel->serial_number;
1296 ci_label->row = 0; /* we don't pretend to support more */
1297
1298 for(column=0;column<raidPtr->numCol;column++) {
1299 diskPtr = &raidPtr->Disks[column];
1300 if (!RF_DEAD_DISK(diskPtr->status)) {
1301 ci_label->partitionSize = diskPtr->partitionSize;
1302 ci_label->column = column;
1303 raidwrite_component_label(
1304 raidPtr->Disks[column].dev,
1305 raidPtr->raid_cinfo[column].ci_vp,
1306 ci_label );
1307 }
1308 }
1309 RF_Free(ci_label, sizeof(RF_ComponentLabel_t));
1310
1311 return (retcode);
1312 case RAIDFRAME_SET_AUTOCONFIG:
1313 d = rf_set_autoconfig(raidPtr, *(int *) data);
1314 printf("raid%d: New autoconfig value is: %d\n",
1315 raidPtr->raidid, d);
1316 *(int *) data = d;
1317 return (retcode);
1318
1319 case RAIDFRAME_SET_ROOT:
1320 d = rf_set_rootpartition(raidPtr, *(int *) data);
1321 printf("raid%d: New rootpartition value is: %d\n",
1322 raidPtr->raidid, d);
1323 *(int *) data = d;
1324 return (retcode);
1325
1326 /* initialize all parity */
1327 case RAIDFRAME_REWRITEPARITY:
1328
1329 if (raidPtr->Layout.map->faultsTolerated == 0) {
1330 /* Parity for RAID 0 is trivially correct */
1331 raidPtr->parity_good = RF_RAID_CLEAN;
1332 return(0);
1333 }
1334
1335 if (raidPtr->parity_rewrite_in_progress == 1) {
1336 /* Re-write is already in progress! */
1337 return(EINVAL);
1338 }
1339
1340 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1341 rf_RewriteParityThread,
1342 raidPtr,"raid_parity");
1343 return (retcode);
1344
1345
1346 case RAIDFRAME_ADD_HOT_SPARE:
1347 sparePtr = (RF_SingleComponent_t *) data;
1348 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1349 retcode = rf_add_hot_spare(raidPtr, &component);
1350 return(retcode);
1351
1352 case RAIDFRAME_REMOVE_HOT_SPARE:
1353 return(retcode);
1354
1355 case RAIDFRAME_DELETE_COMPONENT:
1356 componentPtr = (RF_SingleComponent_t *)data;
1357 memcpy( &component, componentPtr,
1358 sizeof(RF_SingleComponent_t));
1359 retcode = rf_delete_component(raidPtr, &component);
1360 return(retcode);
1361
1362 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1363 componentPtr = (RF_SingleComponent_t *)data;
1364 memcpy( &component, componentPtr,
1365 sizeof(RF_SingleComponent_t));
1366 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1367 return(retcode);
1368
1369 case RAIDFRAME_REBUILD_IN_PLACE:
1370
1371 if (raidPtr->Layout.map->faultsTolerated == 0) {
1372 /* Can't do this on a RAID 0!! */
1373 return(EINVAL);
1374 }
1375
1376 if (raidPtr->recon_in_progress == 1) {
1377 /* a reconstruct is already in progress! */
1378 return(EINVAL);
1379 }
1380
1381 componentPtr = (RF_SingleComponent_t *) data;
1382 memcpy( &component, componentPtr,
1383 sizeof(RF_SingleComponent_t));
1384 component.row = 0; /* we don't support any more */
1385 column = component.column;
1386
1387 if ((column < 0) || (column >= raidPtr->numCol)) {
1388 return(EINVAL);
1389 }
1390
1391 RF_LOCK_MUTEX(raidPtr->mutex);
1392 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1393 (raidPtr->numFailures > 0)) {
1394 /* XXX 0 above shouldn't be constant!!! */
1395 /* some component other than this has failed.
1396 Let's not make things worse than they already
1397 are... */
1398 printf("raid%d: Unable to reconstruct to disk at:\n",
1399 raidPtr->raidid);
1400 printf("raid%d: Col: %d Too many failures.\n",
1401 raidPtr->raidid, column);
1402 RF_UNLOCK_MUTEX(raidPtr->mutex);
1403 return (EINVAL);
1404 }
1405 if (raidPtr->Disks[column].status ==
1406 rf_ds_reconstructing) {
1407 printf("raid%d: Unable to reconstruct to disk at:\n",
1408 raidPtr->raidid);
1409 printf("raid%d: Col: %d Reconstruction already occurring!\n", raidPtr->raidid, column);
1410
1411 RF_UNLOCK_MUTEX(raidPtr->mutex);
1412 return (EINVAL);
1413 }
1414 if (raidPtr->Disks[column].status == rf_ds_spared) {
1415 RF_UNLOCK_MUTEX(raidPtr->mutex);
1416 return (EINVAL);
1417 }
1418 RF_UNLOCK_MUTEX(raidPtr->mutex);
1419
1420 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1421 if (rrcopy == NULL)
1422 return(ENOMEM);
1423
1424 rrcopy->raidPtr = (void *) raidPtr;
1425 rrcopy->col = column;
1426
1427 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1428 rf_ReconstructInPlaceThread,
1429 rrcopy,"raid_reconip");
1430 return(retcode);
1431
1432 case RAIDFRAME_GET_INFO:
1433 if (!raidPtr->valid)
1434 return (ENODEV);
1435 ucfgp = (RF_DeviceConfig_t **) data;
1436 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1437 (RF_DeviceConfig_t *));
1438 if (d_cfg == NULL)
1439 return (ENOMEM);
1440 d_cfg->rows = 1; /* there is only 1 row now */
1441 d_cfg->cols = raidPtr->numCol;
1442 d_cfg->ndevs = raidPtr->numCol;
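		/*
		 * RF_DeviceConfig_t carries fixed-size devs[] and spares[]
		 * arrays, so bound the counts before filling them in below.
		 */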
1443 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1444 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1445 return (ENOMEM);
1446 }
1447 d_cfg->nspares = raidPtr->numSpare;
1448 if (d_cfg->nspares >= RF_MAX_DISKS) {
1449 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1450 return (ENOMEM);
1451 }
1452 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1453 d = 0;
1454 for (j = 0; j < d_cfg->cols; j++) {
1455 d_cfg->devs[d] = raidPtr->Disks[j];
1456 d++;
1457 }
1458 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1459 d_cfg->spares[i] = raidPtr->Disks[j];
1460 }
1461 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1462 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1463
1464 return (retcode);
1465
1466 case RAIDFRAME_CHECK_PARITY:
1467 *(int *) data = raidPtr->parity_good;
1468 return (0);
1469
1470 case RAIDFRAME_RESET_ACCTOTALS:
1471 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1472 return (0);
1473
1474 case RAIDFRAME_GET_ACCTOTALS:
1475 totals = (RF_AccTotals_t *) data;
1476 *totals = raidPtr->acc_totals;
1477 return (0);
1478
1479 case RAIDFRAME_KEEP_ACCTOTALS:
1480 raidPtr->keep_acc_totals = *(int *)data;
1481 return (0);
1482
1483 case RAIDFRAME_GET_SIZE:
1484 *(int *) data = raidPtr->totalSectors;
1485 return (0);
1486
1487 /* fail a disk & optionally start reconstruction */
1488 case RAIDFRAME_FAIL_DISK:
1489
1490 if (raidPtr->Layout.map->faultsTolerated == 0) {
1491 /* Can't do this on a RAID 0!! */
1492 return(EINVAL);
1493 }
1494
1495 rr = (struct rf_recon_req *) data;
1496 rr->row = 0;
1497 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1498 return (EINVAL);
1499
1500
1501 RF_LOCK_MUTEX(raidPtr->mutex);
1502 if (raidPtr->status == rf_rs_reconstructing) {
1503 /* you can't fail a disk while we're reconstructing! */
1504 /* XXX wrong for RAID6 */
1505 RF_UNLOCK_MUTEX(raidPtr->mutex);
1506 return (EINVAL);
1507 }
1508 if ((raidPtr->Disks[rr->col].status ==
1509 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1510 /* some other component has failed. Let's not make
1511 things worse. XXX wrong for RAID6 */
1512 RF_UNLOCK_MUTEX(raidPtr->mutex);
1513 return (EINVAL);
1514 }
1515 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1516 /* Can't fail a spared disk! */
1517 RF_UNLOCK_MUTEX(raidPtr->mutex);
1518 return (EINVAL);
1519 }
1520 RF_UNLOCK_MUTEX(raidPtr->mutex);
1521
1522 /* make a copy of the recon request so that we don't rely on
1523 * the user's buffer */
1524 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1525 if (rrcopy == NULL)
1526 return(ENOMEM);
1527 memcpy(rrcopy, rr, sizeof(*rr));
1528 rrcopy->raidPtr = (void *) raidPtr;
1529
1530 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1531 rf_ReconThread,
1532 rrcopy,"raid_recon");
1533 return (0);
1534
1535 /* invoke a copyback operation after recon on whatever disk
1536 * needs it, if any */
1537 case RAIDFRAME_COPYBACK:
1538
1539 if (raidPtr->Layout.map->faultsTolerated == 0) {
1540 /* This makes no sense on a RAID 0!! */
1541 return(EINVAL);
1542 }
1543
1544 if (raidPtr->copyback_in_progress == 1) {
1545 /* Copyback is already in progress! */
1546 return(EINVAL);
1547 }
1548
1549 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1550 rf_CopybackThread,
1551 raidPtr,"raid_copyback");
1552 return (retcode);
1553
1554 /* return the percentage completion of reconstruction */
1555 case RAIDFRAME_CHECK_RECON_STATUS:
1556 if (raidPtr->Layout.map->faultsTolerated == 0) {
1557 /* This makes no sense on a RAID 0, so tell the
1558 user it's done. */
1559 *(int *) data = 100;
1560 return(0);
1561 }
1562 if (raidPtr->status != rf_rs_reconstructing)
1563 *(int *) data = 100;
1564 else {
1565 if (raidPtr->reconControl->numRUsTotal > 0) {
1566 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1567 } else {
1568 *(int *) data = 0;
1569 }
1570 }
1571 return (0);
1572 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1573 progressInfoPtr = (RF_ProgressInfo_t **) data;
1574 if (raidPtr->status != rf_rs_reconstructing) {
1575 progressInfo.remaining = 0;
1576 progressInfo.completed = 100;
1577 progressInfo.total = 100;
1578 } else {
1579 progressInfo.total =
1580 raidPtr->reconControl->numRUsTotal;
1581 progressInfo.completed =
1582 raidPtr->reconControl->numRUsComplete;
1583 progressInfo.remaining = progressInfo.total -
1584 progressInfo.completed;
1585 }
1586 retcode = copyout(&progressInfo, *progressInfoPtr,
1587 sizeof(RF_ProgressInfo_t));
1588 return (retcode);
1589
1590 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1591 if (raidPtr->Layout.map->faultsTolerated == 0) {
1592 /* This makes no sense on a RAID 0, so tell the
1593 user it's done. */
1594 *(int *) data = 100;
1595 return(0);
1596 }
1597 if (raidPtr->parity_rewrite_in_progress == 1) {
1598 *(int *) data = 100 *
1599 raidPtr->parity_rewrite_stripes_done /
1600 raidPtr->Layout.numStripe;
1601 } else {
1602 *(int *) data = 100;
1603 }
1604 return (0);
1605
1606 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1607 progressInfoPtr = (RF_ProgressInfo_t **) data;
1608 if (raidPtr->parity_rewrite_in_progress == 1) {
1609 progressInfo.total = raidPtr->Layout.numStripe;
1610 progressInfo.completed =
1611 raidPtr->parity_rewrite_stripes_done;
1612 progressInfo.remaining = progressInfo.total -
1613 progressInfo.completed;
1614 } else {
1615 progressInfo.remaining = 0;
1616 progressInfo.completed = 100;
1617 progressInfo.total = 100;
1618 }
1619 retcode = copyout(&progressInfo, *progressInfoPtr,
1620 sizeof(RF_ProgressInfo_t));
1621 return (retcode);
1622
1623 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1624 if (raidPtr->Layout.map->faultsTolerated == 0) {
1625 /* This makes no sense on a RAID 0 */
1626 *(int *) data = 100;
1627 return(0);
1628 }
1629 if (raidPtr->copyback_in_progress == 1) {
1630 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1631 raidPtr->Layout.numStripe;
1632 } else {
1633 *(int *) data = 100;
1634 }
1635 return (0);
1636
1637 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1638 progressInfoPtr = (RF_ProgressInfo_t **) data;
1639 if (raidPtr->copyback_in_progress == 1) {
1640 progressInfo.total = raidPtr->Layout.numStripe;
1641 progressInfo.completed =
1642 raidPtr->copyback_stripes_done;
1643 progressInfo.remaining = progressInfo.total -
1644 progressInfo.completed;
1645 } else {
1646 progressInfo.remaining = 0;
1647 progressInfo.completed = 100;
1648 progressInfo.total = 100;
1649 }
1650 retcode = copyout(&progressInfo, *progressInfoPtr,
1651 sizeof(RF_ProgressInfo_t));
1652 return (retcode);
1653
1654 /* the sparetable daemon calls this to wait for the kernel to
1655 * need a spare table. this ioctl does not return until a
1656 * spare table is needed. XXX -- calling mpsleep here in the
1657 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1658 * -- I should either compute the spare table in the kernel,
1659 * or have a different -- XXX XXX -- interface (a different
1660 * character device) for delivering the table -- XXX */
1661 #if 0
1662 case RAIDFRAME_SPARET_WAIT:
1663 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1664 while (!rf_sparet_wait_queue)
1665 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1666 waitreq = rf_sparet_wait_queue;
1667 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1668 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1669
1670 /* structure assignment */
1671 *((RF_SparetWait_t *) data) = *waitreq;
1672
1673 RF_Free(waitreq, sizeof(*waitreq));
1674 return (0);
1675
1676 /* wakes up a process waiting on SPARET_WAIT and puts an error
1677 * code in it that will cause the daemon to exit */
1678 case RAIDFRAME_ABORT_SPARET_WAIT:
1679 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1680 waitreq->fcol = -1;
1681 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1682 waitreq->next = rf_sparet_wait_queue;
1683 rf_sparet_wait_queue = waitreq;
1684 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1685 wakeup(&rf_sparet_wait_queue);
1686 return (0);
1687
1688 /* used by the spare table daemon to deliver a spare table
1689 * into the kernel */
1690 case RAIDFRAME_SEND_SPARET:
1691
1692 /* install the spare table */
1693 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1694
1695 /* respond to the requestor. the return status of the spare
1696 * table installation is passed in the "fcol" field */
1697 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1698 waitreq->fcol = retcode;
1699 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1700 waitreq->next = rf_sparet_resp_queue;
1701 rf_sparet_resp_queue = waitreq;
1702 wakeup(&rf_sparet_resp_queue);
1703 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1704
1705 return (retcode);
1706 #endif
1707
1708 default:
1709 break; /* fall through to the os-specific code below */
1710
1711 }
1712
1713 if (!raidPtr->valid)
1714 return (EINVAL);
1715
1716 /*
1717 * Add support for "regular" device ioctls here.
1718 */
1719
1720 switch (cmd) {
1721 case DIOCGDINFO:
1722 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1723 break;
1724 #ifdef __HAVE_OLD_DISKLABEL
1725 case ODIOCGDINFO:
1726 newlabel = *(rs->sc_dkdev.dk_label);
1727 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1728 return ENOTTY;
1729 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1730 break;
1731 #endif
1732
1733 case DIOCGPART:
1734 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1735 ((struct partinfo *) data)->part =
1736 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1737 break;
1738
1739 case DIOCWDINFO:
1740 case DIOCSDINFO:
1741 #ifdef __HAVE_OLD_DISKLABEL
1742 case ODIOCWDINFO:
1743 case ODIOCSDINFO:
1744 #endif
1745 {
1746 struct disklabel *lp;
1747 #ifdef __HAVE_OLD_DISKLABEL
1748 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1749 memset(&newlabel, 0, sizeof newlabel);
1750 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1751 lp = &newlabel;
1752 } else
1753 #endif
1754 lp = (struct disklabel *)data;
1755
1756 if ((error = raidlock(rs)) != 0)
1757 return (error);
1758
1759 rs->sc_flags |= RAIDF_LABELLING;
1760
1761 error = setdisklabel(rs->sc_dkdev.dk_label,
1762 lp, 0, rs->sc_dkdev.dk_cpulabel);
1763 if (error == 0) {
1764 if (cmd == DIOCWDINFO
1765 #ifdef __HAVE_OLD_DISKLABEL
1766 || cmd == ODIOCWDINFO
1767 #endif
1768 )
1769 error = writedisklabel(RAIDLABELDEV(dev),
1770 raidstrategy, rs->sc_dkdev.dk_label,
1771 rs->sc_dkdev.dk_cpulabel);
1772 }
1773 rs->sc_flags &= ~RAIDF_LABELLING;
1774
1775 raidunlock(rs);
1776
1777 if (error)
1778 return (error);
1779 break;
1780 }
1781
1782 case DIOCWLABEL:
1783 if (*(int *) data != 0)
1784 rs->sc_flags |= RAIDF_WLABEL;
1785 else
1786 rs->sc_flags &= ~RAIDF_WLABEL;
1787 break;
1788
1789 case DIOCGDEFLABEL:
1790 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1791 break;
1792
1793 #ifdef __HAVE_OLD_DISKLABEL
1794 case ODIOCGDEFLABEL:
1795 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1796 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1797 return ENOTTY;
1798 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1799 break;
1800 #endif
1801
1802 case DIOCAWEDGE:
1803 case DIOCDWEDGE:
1804 dkw = (void *)data;
1805
1806 /* If the ioctl happens here, the parent is us. */
1807 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1808 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1809
1810 case DIOCLWEDGES:
1811 return dkwedge_list(&rs->sc_dkdev,
1812 (struct dkwedge_list *)data, l);
1813
1814 default:
1815 retcode = ENOTTY;
1816 }
1817 return (retcode);
1818
1819 }
1820
1821
1822 /* raidinit -- complete the rest of the initialization for the
1823 RAIDframe device. */
1824
1825
1826 static void
1827 raidinit(RF_Raid_t *raidPtr)
1828 {
1829 struct cfdata *cf;
1830 struct raid_softc *rs;
1831 int unit;
1832
1833 unit = raidPtr->raidid;
1834
1835 rs = &raid_softc[unit];
1836
1837 /* XXX should check return code first... */
1838 rs->sc_flags |= RAIDF_INITED;
1839
1840 /* XXX doesn't check bounds. */
1841 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1842
1843 /* attach the pseudo device */
1844 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1845 cf->cf_name = raid_cd.cd_name;
1846 cf->cf_atname = raid_cd.cd_name;
1847 cf->cf_unit = unit;
1848 cf->cf_fstate = FSTATE_STAR;
1849
1850 rs->sc_dev = config_attach_pseudo(cf);
1851
1852 if (rs->sc_dev==NULL) {
1853 printf("raid%d: config_attach_pseudo failed\n",
1854 raidPtr->raidid);
1855 }
1856
1857 /* disk_attach actually creates space for the CPU disklabel, among
1858 * other things, so it's critical to call this *BEFORE* we try putzing
1859 * with disklabels. */
1860
1861 disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1862 disk_attach(&rs->sc_dkdev);
1863
1864 /* XXX There may be a weird interaction here between this, and
1865 * protectedSectors, as used in RAIDframe. */
1866
1867 rs->sc_size = raidPtr->totalSectors;
1868
1869 dkwedge_discover(&rs->sc_dkdev);
1870
1871 rf_set_properties(rs, raidPtr);
1872
1873 }
1874 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1875 /* wake up the daemon & tell it to get us a spare table
1876 * XXX
1877 * the entries in the queues should be tagged with the raidPtr
1878 * so that in the extremely rare case that two recons happen at once,
1879 * we know for which device we're requesting a spare table
1880 * XXX
1881 *
1882 * XXX This code is not currently used. GO
1883 */
1884 int
1885 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1886 {
1887 int retcode;
1888
1889 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1890 req->next = rf_sparet_wait_queue;
1891 rf_sparet_wait_queue = req;
1892 wakeup(&rf_sparet_wait_queue);
1893
1894 /* mpsleep unlocks the mutex */
1895 while (!rf_sparet_resp_queue) {
1896 tsleep(&rf_sparet_resp_queue, PRIBIO,
1897 "raidframe getsparetable", 0);
1898 }
1899 req = rf_sparet_resp_queue;
1900 rf_sparet_resp_queue = req->next;
1901 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1902
1903 retcode = req->fcol;
1904 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1905 * alloc'd */
1906 return (retcode);
1907 }
1908 #endif
1909
1910 /* A wrapper around rf_DoAccess that extracts the appropriate info from the
1911  * bp and passes it down.  Any calls originating in the kernel must use
1912  * non-blocking I/O.  We also do some extra sanity checking to return
1913  * "appropriate" error values for certain conditions (to make some standard
1914  * utilities work).
1915 *
1916 * Formerly known as: rf_DoAccessKernel
1917 */
1918 void
1919 raidstart(RF_Raid_t *raidPtr)
1920 {
1921 RF_SectorCount_t num_blocks, pb, sum;
1922 RF_RaidAddr_t raid_addr;
1923 struct partition *pp;
1924 daddr_t blocknum;
1925 int unit;
1926 struct raid_softc *rs;
1927 int do_async;
1928 struct buf *bp;
1929 int rc;
1930
1931 unit = raidPtr->raidid;
1932 rs = &raid_softc[unit];
1933
1934 /* quick check to see if anything has died recently */
1935 RF_LOCK_MUTEX(raidPtr->mutex);
1936 if (raidPtr->numNewFailures > 0) {
1937 RF_UNLOCK_MUTEX(raidPtr->mutex);
1938 rf_update_component_labels(raidPtr,
1939 RF_NORMAL_COMPONENT_UPDATE);
1940 RF_LOCK_MUTEX(raidPtr->mutex);
1941 raidPtr->numNewFailures--;
1942 }
1943
1944 /* Check to see if we're at the limit... */
1945 while (raidPtr->openings > 0) {
1946 RF_UNLOCK_MUTEX(raidPtr->mutex);
1947
1948 /* get the next item, if any, from the queue */
1949 if ((bp = BUFQ_GET(rs->buf_queue)) == NULL) {
1950 /* nothing more to do */
1951 return;
1952 }
1953
1954 /* Ok, for the bp we have here, bp->b_blkno is relative to the
1955 * partition.. Need to make it absolute to the underlying
1956 * device.. */
1957
1958 blocknum = bp->b_blkno;
1959 if (DISKPART(bp->b_dev) != RAW_PART) {
1960 pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
1961 blocknum += pp->p_offset;
1962 }
1963
1964 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1965 (int) blocknum));
1966
1967 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1968 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1969
1970 /* *THIS* is where we adjust what block we're going to...
1971 * but DO NOT TOUCH bp->b_blkno!!! */
1972 raid_addr = blocknum;
1973
1974 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1975 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1976 sum = raid_addr + num_blocks + pb;
1977 if (1 || rf_debugKernelAccess) {
1978 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1979 (int) raid_addr, (int) sum, (int) num_blocks,
1980 (int) pb, (int) bp->b_resid));
1981 }
1982 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1983 || (sum < num_blocks) || (sum < pb)) {
1984 bp->b_error = ENOSPC;
1985 bp->b_resid = bp->b_bcount;
1986 biodone(bp);
1987 RF_LOCK_MUTEX(raidPtr->mutex);
1988 continue;
1989 }
1990 /*
1991 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1992 */
1993
1994 if (bp->b_bcount & raidPtr->sectorMask) {
1995 bp->b_error = EINVAL;
1996 bp->b_resid = bp->b_bcount;
1997 biodone(bp);
1998 RF_LOCK_MUTEX(raidPtr->mutex);
1999 continue;
2000
2001 }
2002 db1_printf(("Calling DoAccess..\n"));
2003
2004
2005 RF_LOCK_MUTEX(raidPtr->mutex);
2006 raidPtr->openings--;
2007 RF_UNLOCK_MUTEX(raidPtr->mutex);
2008
2009 /*
2010 * Everything is async.
2011 */
2012 do_async = 1;
2013
2014 disk_busy(&rs->sc_dkdev);
2015
2016 /* XXX we're still at splbio() here... do we *really*
2017 need to be? */
2018
2019 /* don't ever condition on bp->b_flags & B_WRITE.
2020 * always condition on B_READ instead */
2021
2022 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
2023 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
2024 do_async, raid_addr, num_blocks,
2025 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2026
2027 if (rc) {
2028 bp->b_error = rc;
2029 bp->b_resid = bp->b_bcount;
2030 biodone(bp);
2031 /* continue loop */
2032 }
2033
2034 RF_LOCK_MUTEX(raidPtr->mutex);
2035 }
2036 RF_UNLOCK_MUTEX(raidPtr->mutex);
2037 }
2038
2039
2040
2041
2042 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2043
2044 int
2045 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2046 {
2047 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2048 struct buf *bp;
2049
2050 req->queue = queue;
2051
2052 #if DIAGNOSTIC
2053 if (queue->raidPtr->raidid >= numraid) {
2054 printf("Invalid unit number: %d %d\n", queue->raidPtr->raidid,
2055 numraid);
2056 panic("Invalid Unit number in rf_DispatchKernelIO");
2057 }
2058 #endif
2059
2060 bp = req->bp;
2061
2062 switch (req->type) {
2063 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
2064 /* XXX need to do something extra here.. */
2065 /* I'm leaving this in, as I've never actually seen it used,
2066 * and I'd like folks to report it... GO */
2067 		printf("WAKEUP CALLED\n");
2068 queue->numOutstanding++;
2069
2070 bp->b_flags = 0;
2071 bp->b_private = req;
2072
2073 KernelWakeupFunc(bp);
2074 break;
2075
2076 case RF_IO_TYPE_READ:
2077 case RF_IO_TYPE_WRITE:
2078 #if RF_ACC_TRACE > 0
2079 if (req->tracerec) {
2080 RF_ETIMER_START(req->tracerec->timer);
2081 }
2082 #endif
2083 InitBP(bp, queue->rf_cinfo->ci_vp,
2084 op, queue->rf_cinfo->ci_dev,
2085 req->sectorOffset, req->numSector,
2086 req->buf, KernelWakeupFunc, (void *) req,
2087 queue->raidPtr->logBytesPerSector, req->b_proc);
2088
2089 if (rf_debugKernelAccess) {
2090 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2091 (long) bp->b_blkno));
2092 }
2093 queue->numOutstanding++;
2094 queue->last_deq_sector = req->sectorOffset;
2095 /* acc wouldn't have been let in if there were any pending
2096 * reqs at any other priority */
2097 queue->curPriority = req->priority;
2098
2099 db1_printf(("Going for %c to unit %d col %d\n",
2100 req->type, queue->raidPtr->raidid,
2101 queue->col));
2102 db1_printf(("sector %d count %d (%d bytes) %d\n",
2103 (int) req->sectorOffset, (int) req->numSector,
2104 (int) (req->numSector <<
2105 queue->raidPtr->logBytesPerSector),
2106 (int) queue->raidPtr->logBytesPerSector));
2107 bdev_strategy(bp);
2108
2109 break;
2110
2111 default:
2112 panic("bad req->type in rf_DispatchKernelIO");
2113 }
2114 db1_printf(("Exiting from DispatchKernelIO\n"));
2115
2116 return (0);
2117 }
2118 /* this is the callback function associated with an I/O invoked from
2119 kernel code.
2120 */
2121 static void
2122 KernelWakeupFunc(struct buf *bp)
2123 {
2124 RF_DiskQueueData_t *req = NULL;
2125 RF_DiskQueue_t *queue;
2126 int s;
2127
2128 s = splbio();
2129 db1_printf(("recovering the request queue:\n"));
2130 req = bp->b_private;
2131
2132 queue = (RF_DiskQueue_t *) req->queue;
2133
2134 #if RF_ACC_TRACE > 0
2135 if (req->tracerec) {
2136 RF_ETIMER_STOP(req->tracerec->timer);
2137 RF_ETIMER_EVAL(req->tracerec->timer);
2138 RF_LOCK_MUTEX(rf_tracing_mutex);
2139 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2140 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2141 req->tracerec->num_phys_ios++;
2142 RF_UNLOCK_MUTEX(rf_tracing_mutex);
2143 }
2144 #endif
2145
2146 /* XXX Ok, let's get aggressive... If b_error is set, let's go
2147 * ballistic, and mark the component as hosed... */
2148
2149 if (bp->b_error != 0) {
2150 /* Mark the disk as dead */
2151 /* but only mark it once... */
2152 /* and only if it wouldn't leave this RAID set
2153 completely broken */
2154 if (((queue->raidPtr->Disks[queue->col].status ==
2155 rf_ds_optimal) ||
2156 (queue->raidPtr->Disks[queue->col].status ==
2157 rf_ds_used_spare)) &&
2158 (queue->raidPtr->numFailures <
2159 queue->raidPtr->Layout.map->faultsTolerated)) {
2160 printf("raid%d: IO Error. Marking %s as failed.\n",
2161 queue->raidPtr->raidid,
2162 queue->raidPtr->Disks[queue->col].devname);
2163 queue->raidPtr->Disks[queue->col].status =
2164 rf_ds_failed;
2165 queue->raidPtr->status = rf_rs_degraded;
2166 queue->raidPtr->numFailures++;
2167 queue->raidPtr->numNewFailures++;
2168 } else { /* Disk is already dead... */
2169 /* printf("Disk already marked as dead!\n"); */
2170 }
2171
2172 }
2173
2174 /* Fill in the error value */
2175
2176 req->error = bp->b_error;
2177
2178 simple_lock(&queue->raidPtr->iodone_lock);
2179
2180 /* Drop this one on the "finished" queue... */
2181 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2182
2183 /* Let the raidio thread know there is work to be done. */
2184 wakeup(&(queue->raidPtr->iodone));
2185
2186 simple_unlock(&queue->raidPtr->iodone_lock);
2187
2188 splx(s);
2189 }
2190
2191
2192
2193 /*
2194 * initialize a buf structure for doing an I/O in the kernel.
2195 */
2196 static void
2197 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2198 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2199 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2200 struct proc *b_proc)
2201 {
2202 /* bp->b_flags = B_PHYS | rw_flag; */
2203 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2204 bp->b_oflags = 0;
2205 bp->b_cflags = 0;
2206 bp->b_bcount = numSect << logBytesPerSector;
2207 bp->b_bufsize = bp->b_bcount;
2208 bp->b_error = 0;
2209 bp->b_dev = dev;
2210 bp->b_data = bf;
2211 bp->b_blkno = startSect;
2212 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2213 if (bp->b_bcount == 0) {
2214 panic("bp->b_bcount is zero in InitBP!!");
2215 }
2216 bp->b_proc = b_proc;
2217 bp->b_iodone = cbFunc;
2218 bp->b_private = cbArg;
2219 }
2220
2221 static void
2222 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2223 struct disklabel *lp)
2224 {
2225 memset(lp, 0, sizeof(*lp));
2226
2227 /* fabricate a label... */
2228 lp->d_secperunit = raidPtr->totalSectors;
2229 lp->d_secsize = raidPtr->bytesPerSector;
2230 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2231 lp->d_ntracks = 4 * raidPtr->numCol;
2232 lp->d_ncylinders = raidPtr->totalSectors /
2233 (lp->d_nsectors * lp->d_ntracks);
2234 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2235
2236 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2237 lp->d_type = DTYPE_RAID;
2238 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2239 lp->d_rpm = 3600;
2240 lp->d_interleave = 1;
2241 lp->d_flags = 0;
2242
2243 lp->d_partitions[RAW_PART].p_offset = 0;
2244 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2245 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2246 lp->d_npartitions = RAW_PART + 1;
2247
2248 lp->d_magic = DISKMAGIC;
2249 lp->d_magic2 = DISKMAGIC;
2250 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2251
2252 }
2253 /*
2254 * Read the disklabel from the raid device. If one is not present, fake one
2255 * up.
2256 */
2257 static void
2258 raidgetdisklabel(dev_t dev)
2259 {
2260 int unit = raidunit(dev);
2261 struct raid_softc *rs = &raid_softc[unit];
2262 const char *errstring;
2263 struct disklabel *lp = rs->sc_dkdev.dk_label;
2264 struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
2265 RF_Raid_t *raidPtr;
2266
2267 db1_printf(("Getting the disklabel...\n"));
2268
2269 memset(clp, 0, sizeof(*clp));
2270
2271 raidPtr = raidPtrs[unit];
2272
2273 raidgetdefaultlabel(raidPtr, rs, lp);
2274
2275 /*
2276 * Call the generic disklabel extraction routine.
2277 */
2278 errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
2279 rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
2280 if (errstring)
2281 raidmakedisklabel(rs);
2282 else {
2283 int i;
2284 struct partition *pp;
2285
2286 /*
2287 * Sanity check whether the found disklabel is valid.
2288 *
2289 		 * This is necessary since the total size of the raid device
2290 		 * may vary when the interleave is changed even though exactly
2291 		 * the same components are used, and an old disklabel may be
2292 		 * used if one is found.
2293 */
2294 if (lp->d_secperunit != rs->sc_size)
2295 printf("raid%d: WARNING: %s: "
2296 "total sector size in disklabel (%d) != "
2297 "the size of raid (%ld)\n", unit, rs->sc_xname,
2298 lp->d_secperunit, (long) rs->sc_size);
2299 for (i = 0; i < lp->d_npartitions; i++) {
2300 pp = &lp->d_partitions[i];
2301 if (pp->p_offset + pp->p_size > rs->sc_size)
2302 printf("raid%d: WARNING: %s: end of partition `%c' "
2303 "exceeds the size of raid (%ld)\n",
2304 unit, rs->sc_xname, 'a' + i, (long) rs->sc_size);
2305 }
2306 }
2307
2308 }
2309 /*
2310 * Take care of things one might want to take care of in the event
2311 * that a disklabel isn't present.
2312 */
2313 static void
2314 raidmakedisklabel(struct raid_softc *rs)
2315 {
2316 struct disklabel *lp = rs->sc_dkdev.dk_label;
2317 db1_printf(("Making a label..\n"));
2318
2319 /*
2320 * For historical reasons, if there's no disklabel present
2321 * the raw partition must be marked FS_BSDFFS.
2322 */
2323
2324 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2325
2326 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2327
2328 lp->d_checksum = dkcksum(lp);
2329 }
2330 /*
2331 * Wait interruptibly for an exclusive lock.
2332 *
2333 * XXX
2334 * Several drivers do this; it should be abstracted and made MP-safe.
2335 * (Hmm... where have we seen this warning before :-> GO )
2336 */
2337 static int
2338 raidlock(struct raid_softc *rs)
2339 {
2340 int error;
2341
2342 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2343 rs->sc_flags |= RAIDF_WANTED;
2344 if ((error =
2345 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2346 return (error);
2347 }
2348 rs->sc_flags |= RAIDF_LOCKED;
2349 return (0);
2350 }
2351 /*
2352 * Unlock and wake up any waiters.
2353 */
2354 static void
2355 raidunlock(struct raid_softc *rs)
2356 {
2357
2358 rs->sc_flags &= ~RAIDF_LOCKED;
2359 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2360 rs->sc_flags &= ~RAIDF_WANTED;
2361 wakeup(rs);
2362 }
2363 }
2364
2365
2366 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2367 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2368
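/*
 * Helpers for flipping the clean/dirty bit in the on-disk component label.
 * The label lives at a fixed offset (RF_COMPONENT_INFO_OFFSET bytes) from
 * the start of each component; these read it, update the modification
 * counter, set the clean or dirty flag, and write it back.
 */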
2369 int
2370 raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter)
2371 {
2372 RF_ComponentLabel_t clabel;
2373 raidread_component_label(dev, b_vp, &clabel);
2374 clabel.mod_counter = mod_counter;
2375 clabel.clean = RF_RAID_CLEAN;
2376 raidwrite_component_label(dev, b_vp, &clabel);
2377 return(0);
2378 }
2379
2380
2381 int
2382 raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter)
2383 {
2384 RF_ComponentLabel_t clabel;
2385 raidread_component_label(dev, b_vp, &clabel);
2386 clabel.mod_counter = mod_counter;
2387 clabel.clean = RF_RAID_DIRTY;
2388 raidwrite_component_label(dev, b_vp, &clabel);
2389 return(0);
2390 }
2391
2392 /* ARGSUSED */
2393 int
2394 raidread_component_label(dev_t dev, struct vnode *b_vp,
2395 RF_ComponentLabel_t *clabel)
2396 {
2397 struct buf *bp;
2398 const struct bdevsw *bdev;
2399 int error;
2400
2401 /* XXX should probably ensure that we don't try to do this if
2402 someone has changed rf_protected_sectors. */
2403
2404 if (b_vp == NULL) {
2405 /* For whatever reason, this component is not valid.
2406 Don't try to read a component label from it. */
2407 return(EINVAL);
2408 }
2409
2410 /* get a block of the appropriate size... */
2411 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2412 bp->b_dev = dev;
2413
2414 /* get our ducks in a row for the read */
2415 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2416 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2417 bp->b_flags |= B_READ;
2418 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2419
2420 bdev = bdevsw_lookup(bp->b_dev);
2421 if (bdev == NULL)
2422 return (ENXIO);
2423 (*bdev->d_strategy)(bp);
2424
2425 error = biowait(bp);
2426
2427 if (!error) {
2428 memcpy(clabel, bp->b_data,
2429 sizeof(RF_ComponentLabel_t));
2430 }
2431
2432 brelse(bp, 0);
2433 return(error);
2434 }
2435 /* ARGSUSED */
2436 int
2437 raidwrite_component_label(dev_t dev, struct vnode *b_vp,
2438 RF_ComponentLabel_t *clabel)
2439 {
2440 struct buf *bp;
2441 const struct bdevsw *bdev;
2442 int error;
2443
2444 /* get a block of the appropriate size... */
2445 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2446 bp->b_dev = dev;
2447
2448 /* get our ducks in a row for the write */
2449 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2450 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2451 bp->b_flags |= B_WRITE;
2452 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2453
2454 memset(bp->b_data, 0, RF_COMPONENT_INFO_SIZE );
2455
2456 memcpy(bp->b_data, clabel, sizeof(RF_ComponentLabel_t));
2457
2458 bdev = bdevsw_lookup(bp->b_dev);
2459 if (bdev == NULL)
2460 return (ENXIO);
2461 (*bdev->d_strategy)(bp);
2462 error = biowait(bp);
2463 brelse(bp, 0);
2464 if (error) {
2465 #if 1
2466 printf("Failed to write RAID component info!\n");
2467 #endif
2468 }
2469
2470 return(error);
2471 }
2472
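/*
 * Bump the modification counter and mark the component label of every
 * non-failed component (and every in-use spare) dirty.  Spared components
 * are left alone entirely.  The dirty bit is what allows an unclean
 * shutdown to be detected later on.
 */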
2473 void
2474 rf_markalldirty(RF_Raid_t *raidPtr)
2475 {
2476 RF_ComponentLabel_t clabel;
2477 int sparecol;
2478 int c;
2479 int j;
2480 int scol = -1;
2481
2482 raidPtr->mod_counter++;
2483 for (c = 0; c < raidPtr->numCol; c++) {
2484 /* we don't want to touch (at all) a disk that has
2485 failed */
2486 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2487 raidread_component_label(
2488 raidPtr->Disks[c].dev,
2489 raidPtr->raid_cinfo[c].ci_vp,
2490 &clabel);
2491 if (clabel.status == rf_ds_spared) {
2492 /* XXX do something special...
2493 but whatever you do, don't
2494 try to access it!! */
2495 } else {
2496 raidmarkdirty(
2497 raidPtr->Disks[c].dev,
2498 raidPtr->raid_cinfo[c].ci_vp,
2499 raidPtr->mod_counter);
2500 }
2501 }
2502 }
2503
2504 for( c = 0; c < raidPtr->numSpare ; c++) {
2505 sparecol = raidPtr->numCol + c;
2506 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2507 /*
2508
2509 we claim this disk is "optimal" if it's
2510 rf_ds_used_spare, as that means it should be
2511 directly substitutable for the disk it replaced.
2512 We note that too...
2513
2514 */
2515
2516 for(j=0;j<raidPtr->numCol;j++) {
2517 if (raidPtr->Disks[j].spareCol == sparecol) {
2518 scol = j;
2519 break;
2520 }
2521 }
2522
2523 raidread_component_label(
2524 raidPtr->Disks[sparecol].dev,
2525 raidPtr->raid_cinfo[sparecol].ci_vp,
2526 &clabel);
2527 /* make sure status is noted */
2528
2529 raid_init_component_label(raidPtr, &clabel);
2530
2531 clabel.row = 0;
2532 clabel.column = scol;
2533 /* Note: we *don't* change status from rf_ds_used_spare
2534 to rf_ds_optimal */
2535 /* clabel.status = rf_ds_optimal; */
2536
2537 raidmarkdirty(raidPtr->Disks[sparecol].dev,
2538 raidPtr->raid_cinfo[sparecol].ci_vp,
2539 raidPtr->mod_counter);
2540 }
2541 }
2542 }
2543
2544
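/*
 * Refresh the component labels on all optimal components and in-use spares:
 * bump the modification counter, record the unit we are configured as, and
 * (for spares) the column they are standing in for.  On a final update
 * (RF_FINAL_COMPONENT_UPDATE) with known-good parity the labels are also
 * marked clean.  Failed components are not touched.
 */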
2545 void
2546 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2547 {
2548 RF_ComponentLabel_t clabel;
2549 int sparecol;
2550 int c;
2551 int j;
2552 int scol;
2553
2554 scol = -1;
2555
2556 /* XXX should do extra checks to make sure things really are clean,
2557 rather than blindly setting the clean bit... */
2558
2559 raidPtr->mod_counter++;
2560
2561 for (c = 0; c < raidPtr->numCol; c++) {
2562 if (raidPtr->Disks[c].status == rf_ds_optimal) {
2563 raidread_component_label(
2564 raidPtr->Disks[c].dev,
2565 raidPtr->raid_cinfo[c].ci_vp,
2566 &clabel);
2567 /* make sure status is noted */
2568 clabel.status = rf_ds_optimal;
2569
2570 /* bump the counter */
2571 clabel.mod_counter = raidPtr->mod_counter;
2572
2573 /* note what unit we are configured as */
2574 clabel.last_unit = raidPtr->raidid;
2575
2576 raidwrite_component_label(
2577 raidPtr->Disks[c].dev,
2578 raidPtr->raid_cinfo[c].ci_vp,
2579 &clabel);
2580 if (final == RF_FINAL_COMPONENT_UPDATE) {
2581 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2582 raidmarkclean(
2583 raidPtr->Disks[c].dev,
2584 raidPtr->raid_cinfo[c].ci_vp,
2585 raidPtr->mod_counter);
2586 }
2587 }
2588 }
2589 /* else we don't touch it.. */
2590 }
2591
2592 for( c = 0; c < raidPtr->numSpare ; c++) {
2593 sparecol = raidPtr->numCol + c;
2594 /* Need to ensure that the reconstruct actually completed! */
2595 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2596 /*
2597
2598 we claim this disk is "optimal" if it's
2599 rf_ds_used_spare, as that means it should be
2600 directly substitutable for the disk it replaced.
2601 We note that too...
2602
2603 */
2604
2605 for(j=0;j<raidPtr->numCol;j++) {
2606 if (raidPtr->Disks[j].spareCol == sparecol) {
2607 scol = j;
2608 break;
2609 }
2610 }
2611
2612 /* XXX shouldn't *really* need this... */
2613 raidread_component_label(
2614 raidPtr->Disks[sparecol].dev,
2615 raidPtr->raid_cinfo[sparecol].ci_vp,
2616 &clabel);
2617 /* make sure status is noted */
2618
2619 raid_init_component_label(raidPtr, &clabel);
2620
2621 clabel.mod_counter = raidPtr->mod_counter;
2622 clabel.column = scol;
2623 clabel.status = rf_ds_optimal;
2624 clabel.last_unit = raidPtr->raidid;
2625
2626 raidwrite_component_label(
2627 raidPtr->Disks[sparecol].dev,
2628 raidPtr->raid_cinfo[sparecol].ci_vp,
2629 &clabel);
2630 if (final == RF_FINAL_COMPONENT_UPDATE) {
2631 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2632 raidmarkclean( raidPtr->Disks[sparecol].dev,
2633 raidPtr->raid_cinfo[sparecol].ci_vp,
2634 raidPtr->mod_counter);
2635 }
2636 }
2637 }
2638 }
2639 }
2640
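/*
 * Close a component's vnode.  Components that were opened by the
 * autoconfiguration code (via VOP_OPEN) are closed with VOP_CLOSE/vput;
 * everything else goes through vn_close.
 */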
2641 void
2642 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2643 {
2644
2645 if (vp != NULL) {
2646 if (auto_configured == 1) {
2647 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2648 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2649 vput(vp);
2650
2651 } else {
2652 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2653 }
2654 }
2655 }
2656
2657
2658 void
2659 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2660 {
2661 int r,c;
2662 struct vnode *vp;
2663 int acd;
2664
2665
2666 /* We take this opportunity to close the vnodes like we should.. */
2667
2668 for (c = 0; c < raidPtr->numCol; c++) {
2669 vp = raidPtr->raid_cinfo[c].ci_vp;
2670 acd = raidPtr->Disks[c].auto_configured;
2671 rf_close_component(raidPtr, vp, acd);
2672 raidPtr->raid_cinfo[c].ci_vp = NULL;
2673 raidPtr->Disks[c].auto_configured = 0;
2674 }
2675
2676 for (r = 0; r < raidPtr->numSpare; r++) {
2677 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2678 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2679 rf_close_component(raidPtr, vp, acd);
2680 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2681 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2682 }
2683 }
2684
2685
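/*
 * Kernel thread body: fail the indicated component via rf_FailDisk(),
 * optionally kicking off reconstruction to a spare (RF_FDFLAGS_RECON),
 * then free the request and exit.
 */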
2686 void
2687 rf_ReconThread(struct rf_recon_req *req)
2688 {
2689 int s;
2690 RF_Raid_t *raidPtr;
2691
2692 s = splbio();
2693 raidPtr = (RF_Raid_t *) req->raidPtr;
2694 raidPtr->recon_in_progress = 1;
2695
2696 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2697 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2698
2699 RF_Free(req, sizeof(*req));
2700
2701 raidPtr->recon_in_progress = 0;
2702 splx(s);
2703
2704 /* That's all... */
2705 kthread_exit(0); /* does not return */
2706 }
2707
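/*
 * Kernel thread body: rewrite all of the parity for this set.  On success
 * the in-core parity state is marked clean so the clean bit can make it to
 * the component labels on a proper shutdown; anyone waiting for us in the
 * shutdown path is woken before we exit.
 */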
2708 void
2709 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2710 {
2711 int retcode;
2712 int s;
2713
2714 raidPtr->parity_rewrite_stripes_done = 0;
2715 raidPtr->parity_rewrite_in_progress = 1;
2716 s = splbio();
2717 retcode = rf_RewriteParity(raidPtr);
2718 splx(s);
2719 if (retcode) {
2720 printf("raid%d: Error re-writing parity!\n",raidPtr->raidid);
2721 } else {
2722 /* set the clean bit! If we shutdown correctly,
2723 the clean bit on each component label will get
2724 set */
2725 raidPtr->parity_good = RF_RAID_CLEAN;
2726 }
2727 raidPtr->parity_rewrite_in_progress = 0;
2728
2729 /* Anyone waiting for us to stop? If so, inform them... */
2730 if (raidPtr->waitShutdown) {
2731 wakeup(&raidPtr->parity_rewrite_in_progress);
2732 }
2733
2734 /* That's all... */
2735 kthread_exit(0); /* does not return */
2736 }
2737
2738
2739 void
2740 rf_CopybackThread(RF_Raid_t *raidPtr)
2741 {
2742 int s;
2743
2744 raidPtr->copyback_in_progress = 1;
2745 s = splbio();
2746 rf_CopybackReconstructedData(raidPtr);
2747 splx(s);
2748 raidPtr->copyback_in_progress = 0;
2749
2750 /* That's all... */
2751 kthread_exit(0); /* does not return */
2752 }
2753
2754
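/*
 * As rf_ReconThread(), but rebuilds the data in place on the indicated
 * component via rf_ReconstructInPlace().
 */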
2755 void
2756 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2757 {
2758 int s;
2759 RF_Raid_t *raidPtr;
2760
2761 s = splbio();
2762 raidPtr = req->raidPtr;
2763 raidPtr->recon_in_progress = 1;
2764 rf_ReconstructInPlace(raidPtr, req->col);
2765 RF_Free(req, sizeof(*req));
2766 raidPtr->recon_in_progress = 0;
2767 splx(s);
2768
2769 /* That's all... */
2770 kthread_exit(0); /* does not return */
2771 }
2772
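/*
 * Read the component label from the given vnode.  If it looks reasonable
 * and doesn't claim more space than the partition/wedge provides, prepend
 * a new RF_AutoConfig_t entry for it to ac_list; otherwise close and
 * release the vnode.  Returns the (possibly new) head of the list, or
 * NULL if we ran out of memory (in which case the entire list is freed).
 */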
2773 static RF_AutoConfig_t *
2774 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2775 const char *cname, RF_SectorCount_t size)
2776 {
2777 int good_one = 0;
2778 RF_ComponentLabel_t *clabel;
2779 RF_AutoConfig_t *ac;
2780
2781 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
2782 if (clabel == NULL) {
2783 oomem:
2784 while(ac_list) {
2785 ac = ac_list;
2786 if (ac->clabel)
2787 free(ac->clabel, M_RAIDFRAME);
2788 ac_list = ac_list->next;
2789 free(ac, M_RAIDFRAME);
2790 }
2791 printf("RAID auto config: out of memory!\n");
2792 return NULL; /* XXX probably should panic? */
2793 }
2794
2795 if (!raidread_component_label(dev, vp, clabel)) {
2796 /* Got the label. Does it look reasonable? */
2797 if (rf_reasonable_label(clabel) &&
2798 (clabel->partitionSize <= size)) {
2799 #ifdef DEBUG
2800 printf("Component on: %s: %llu\n",
2801 cname, (unsigned long long)size);
2802 rf_print_component_label(clabel);
2803 #endif
2804 /* if it's reasonable, add it, else ignore it. */
2805 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2806 M_NOWAIT);
2807 if (ac == NULL) {
2808 free(clabel, M_RAIDFRAME);
2809 goto oomem;
2810 }
2811 strlcpy(ac->devname, cname, sizeof(ac->devname));
2812 ac->dev = dev;
2813 ac->vp = vp;
2814 ac->clabel = clabel;
2815 ac->next = ac_list;
2816 ac_list = ac;
2817 good_one = 1;
2818 }
2819 }
2820 if (!good_one) {
2821 /* cleanup */
2822 free(clabel, M_RAIDFRAME);
2823 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2824 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2825 vput(vp);
2826 }
2827 return ac_list;
2828 }
2829
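/*
 * Troll through every disk-class device in the system (skipping floppies,
 * CDs and memory disks), open the raw partition or the wedge itself, and
 * hand anything marked FS_RAID / DKW_PTYPE_RAIDFRAME to rf_get_component().
 * Returns the list of candidate components for autoconfiguration.
 */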
2830 RF_AutoConfig_t *
2831 rf_find_raid_components(void)
2832 {
2833 struct vnode *vp;
2834 struct disklabel label;
2835 struct device *dv;
2836 dev_t dev;
2837 int bmajor, bminor, wedge;
2838 int error;
2839 int i;
2840 RF_AutoConfig_t *ac_list;
2841
2842
2843 /* initialize the AutoConfig list */
2844 ac_list = NULL;
2845
2846 /* we begin by trolling through *all* the devices on the system */
2847
2848 for (dv = alldevs.tqh_first; dv != NULL;
2849 dv = dv->dv_list.tqe_next) {
2850
2851 /* we are only interested in disks... */
2852 if (device_class(dv) != DV_DISK)
2853 continue;
2854
2855 /* we don't care about floppies... */
2856 if (device_is_a(dv, "fd")) {
2857 continue;
2858 }
2859
2860 /* we don't care about CD's... */
2861 if (device_is_a(dv, "cd")) {
2862 continue;
2863 }
2864
2865 /* we don't care about md's... */
2866 if (device_is_a(dv, "md")) {
2867 continue;
2868 }
2869
2870 /* hdfd is the Atari/Hades floppy driver */
2871 if (device_is_a(dv, "hdfd")) {
2872 continue;
2873 }
2874
2875 /* fdisa is the Atari/Milan floppy driver */
2876 if (device_is_a(dv, "fdisa")) {
2877 continue;
2878 }
2879
2880 /* need to find the device_name_to_block_device_major stuff */
2881 bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
2882
2883 /* get a vnode for the raw partition of this disk */
2884
2885 wedge = device_is_a(dv, "dk");
2886 bminor = minor(device_unit(dv));
2887 dev = wedge ? makedev(bmajor, bminor) :
2888 MAKEDISKDEV(bmajor, bminor, RAW_PART);
2889 if (bdevvp(dev, &vp))
2890 panic("RAID can't alloc vnode");
2891
2892 error = VOP_OPEN(vp, FREAD, NOCRED);
2893
2894 if (error) {
2895 			/* "Who cares."  Continue looking
2896 			   for something that exists. */
2897 vput(vp);
2898 continue;
2899 }
2900
2901 if (wedge) {
2902 struct dkwedge_info dkw;
2903 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
2904 NOCRED);
2905 if (error) {
2906 printf("RAIDframe: can't get wedge info for "
2907 "dev %s (%d)\n", device_xname(dv), error);
2908 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2909 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2910 vput(vp);
2911 continue;
2912 }
2913
2914 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
2915 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2916 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2917 vput(vp);
2918 continue;
2919 }
2920
2921 ac_list = rf_get_component(ac_list, dev, vp,
2922 device_xname(dv), dkw.dkw_size);
2923 continue;
2924 }
2925
2926 /* Ok, the disk exists. Go get the disklabel. */
2927 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
2928 if (error) {
2929 /*
2930 * XXX can't happen - open() would
2931 * have errored out (or faked up one)
2932 */
2933 if (error != ENOTTY)
2934 printf("RAIDframe: can't get label for dev "
2935 "%s (%d)\n", device_xname(dv), error);
2936 }
2937
2938 /* don't need this any more. We'll allocate it again
2939 a little later if we really do... */
2940 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2941 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2942 vput(vp);
2943
2944 if (error)
2945 continue;
2946
2947 for (i = 0; i < label.d_npartitions; i++) {
2948 char cname[sizeof(ac_list->devname)];
2949
2950 /* We only support partitions marked as RAID */
2951 if (label.d_partitions[i].p_fstype != FS_RAID)
2952 continue;
2953
2954 dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
2955 if (bdevvp(dev, &vp))
2956 panic("RAID can't alloc vnode");
2957
2958 error = VOP_OPEN(vp, FREAD, NOCRED);
2959 if (error) {
2960 /* Whatever... */
2961 vput(vp);
2962 continue;
2963 }
2964 snprintf(cname, sizeof(cname), "%s%c",
2965 device_xname(dv), 'a' + i);
2966 ac_list = rf_get_component(ac_list, dev, vp, cname,
2967 label.d_partitions[i].p_size);
2968 }
2969 }
2970 return ac_list;
2971 }
2972
2973
2974 static int
2975 rf_reasonable_label(RF_ComponentLabel_t *clabel)
2976 {
2977
2978 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2979 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2980 ((clabel->clean == RF_RAID_CLEAN) ||
2981 (clabel->clean == RF_RAID_DIRTY)) &&
2982 clabel->row >=0 &&
2983 clabel->column >= 0 &&
2984 clabel->num_rows > 0 &&
2985 clabel->num_columns > 0 &&
2986 clabel->row < clabel->num_rows &&
2987 clabel->column < clabel->num_columns &&
2988 clabel->blockSize > 0 &&
2989 clabel->numBlocks > 0) {
2990 /* label looks reasonable enough... */
2991 return(1);
2992 }
2993 return(0);
2994 }
2995
2996
2997 #ifdef DEBUG
2998 void
2999 rf_print_component_label(RF_ComponentLabel_t *clabel)
3000 {
3001 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
3002 clabel->row, clabel->column,
3003 clabel->num_rows, clabel->num_columns);
3004 printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
3005 clabel->version, clabel->serial_number,
3006 clabel->mod_counter);
3007 printf(" Clean: %s Status: %d\n",
3008 clabel->clean ? "Yes" : "No", clabel->status );
3009 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
3010 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
3011 printf(" RAID Level: %c blocksize: %d numBlocks: %d\n",
3012 (char) clabel->parityConfig, clabel->blockSize,
3013 clabel->numBlocks);
3014 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No" );
3015 printf(" Contains root partition: %s\n",
3016 clabel->root_partition ? "Yes" : "No" );
3017 printf(" Last configured as: raid%d\n", clabel->last_unit );
3018 #if 0
3019 printf(" Config order: %d\n", clabel->config_order);
3020 #endif
3021
3022 }
3023 #endif
3024
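/*
 * Sort the flat list of autoconfig candidates into configuration sets.
 * Each component is added to the first existing set whose label matches
 * (as judged by rf_does_it_fit()), or starts a new set of its own.
 */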
3025 RF_ConfigSet_t *
3026 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3027 {
3028 RF_AutoConfig_t *ac;
3029 RF_ConfigSet_t *config_sets;
3030 RF_ConfigSet_t *cset;
3031 RF_AutoConfig_t *ac_next;
3032
3033
3034 config_sets = NULL;
3035
3036 /* Go through the AutoConfig list, and figure out which components
3037 belong to what sets. */
3038 ac = ac_list;
3039 while(ac!=NULL) {
3040 /* we're going to putz with ac->next, so save it here
3041 for use at the end of the loop */
3042 ac_next = ac->next;
3043
3044 if (config_sets == NULL) {
3045 /* will need at least this one... */
3046 config_sets = (RF_ConfigSet_t *)
3047 malloc(sizeof(RF_ConfigSet_t),
3048 M_RAIDFRAME, M_NOWAIT);
3049 if (config_sets == NULL) {
3050 panic("rf_create_auto_sets: No memory!");
3051 }
3052 /* this one is easy :) */
3053 config_sets->ac = ac;
3054 config_sets->next = NULL;
3055 config_sets->rootable = 0;
3056 ac->next = NULL;
3057 } else {
3058 /* which set does this component fit into? */
3059 cset = config_sets;
3060 while(cset!=NULL) {
3061 if (rf_does_it_fit(cset, ac)) {
3062 /* looks like it matches... */
3063 ac->next = cset->ac;
3064 cset->ac = ac;
3065 break;
3066 }
3067 cset = cset->next;
3068 }
3069 if (cset==NULL) {
3070 /* didn't find a match above... new set..*/
3071 cset = (RF_ConfigSet_t *)
3072 malloc(sizeof(RF_ConfigSet_t),
3073 M_RAIDFRAME, M_NOWAIT);
3074 if (cset == NULL) {
3075 panic("rf_create_auto_sets: No memory!");
3076 }
3077 cset->ac = ac;
3078 ac->next = NULL;
3079 cset->next = config_sets;
3080 cset->rootable = 0;
3081 config_sets = cset;
3082 }
3083 }
3084 ac = ac_next;
3085 }
3086
3087
3088 return(config_sets);
3089 }
3090
3091 static int
3092 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3093 {
3094 RF_ComponentLabel_t *clabel1, *clabel2;
3095
3096 /* If this one matches the *first* one in the set, that's good
3097 enough, since the other members of the set would have been
3098 through here too... */
3099 /* note that we are not checking partitionSize here..
3100
3101 Note that we are also not checking the mod_counters here.
3102 	   If everything else matches except the mod_counter, that's
3103 good enough for this test. We will deal with the mod_counters
3104 a little later in the autoconfiguration process.
3105
3106 (clabel1->mod_counter == clabel2->mod_counter) &&
3107
3108 The reason we don't check for this is that failed disks
3109 will have lower modification counts. If those disks are
3110 not added to the set they used to belong to, then they will
3111 form their own set, which may result in 2 different sets,
3112 for example, competing to be configured at raid0, and
3113 perhaps competing to be the root filesystem set. If the
3114 wrong ones get configured, or both attempt to become /,
3115 	   weird behaviour and/or serious lossage will occur.  Thus we
3116 need to bring them into the fold here, and kick them out at
3117 a later point.
3118
3119 */
3120
3121 clabel1 = cset->ac->clabel;
3122 clabel2 = ac->clabel;
3123 if ((clabel1->version == clabel2->version) &&
3124 (clabel1->serial_number == clabel2->serial_number) &&
3125 (clabel1->num_rows == clabel2->num_rows) &&
3126 (clabel1->num_columns == clabel2->num_columns) &&
3127 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3128 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3129 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3130 (clabel1->parityConfig == clabel2->parityConfig) &&
3131 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3132 (clabel1->blockSize == clabel2->blockSize) &&
3133 (clabel1->numBlocks == clabel2->numBlocks) &&
3134 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3135 (clabel1->root_partition == clabel2->root_partition) &&
3136 (clabel1->last_unit == clabel2->last_unit) &&
3137 (clabel1->config_order == clabel2->config_order)) {
3138 		/* if it gets here, it almost *has* to be a match */
3139 } else {
3140 /* it's not consistent with somebody in the set..
3141 punt */
3142 return(0);
3143 }
3144 /* all was fine.. it must fit... */
3145 return(1);
3146 }
3147
3148 int
3149 rf_have_enough_components(RF_ConfigSet_t *cset)
3150 {
3151 RF_AutoConfig_t *ac;
3152 RF_AutoConfig_t *auto_config;
3153 RF_ComponentLabel_t *clabel;
3154 int c;
3155 int num_cols;
3156 int num_missing;
3157 int mod_counter;
3158 int mod_counter_found;
3159 int even_pair_failed;
3160 char parity_type;
3161
3162
3163 /* check to see that we have enough 'live' components
3164 of this set. If so, we can configure it if necessary */
3165
3166 num_cols = cset->ac->clabel->num_columns;
3167 parity_type = cset->ac->clabel->parityConfig;
3168
3169 /* XXX Check for duplicate components!?!?!? */
3170
3171 /* Determine what the mod_counter is supposed to be for this set. */
3172
3173 mod_counter_found = 0;
3174 mod_counter = 0;
3175 ac = cset->ac;
3176 while(ac!=NULL) {
3177 if (mod_counter_found==0) {
3178 mod_counter = ac->clabel->mod_counter;
3179 mod_counter_found = 1;
3180 } else {
3181 if (ac->clabel->mod_counter > mod_counter) {
3182 mod_counter = ac->clabel->mod_counter;
3183 }
3184 }
3185 ac = ac->next;
3186 }
3187
3188 num_missing = 0;
3189 auto_config = cset->ac;
3190
3191 even_pair_failed = 0;
3192 for(c=0; c<num_cols; c++) {
3193 ac = auto_config;
3194 while(ac!=NULL) {
3195 if ((ac->clabel->column == c) &&
3196 (ac->clabel->mod_counter == mod_counter)) {
3197 /* it's this one... */
3198 #ifdef DEBUG
3199 printf("Found: %s at %d\n",
3200 ac->devname,c);
3201 #endif
3202 break;
3203 }
3204 ac=ac->next;
3205 }
3206 if (ac==NULL) {
3207 /* Didn't find one here! */
3208 /* special case for RAID 1, especially
3209 where there are more than 2
3210 components (where RAIDframe treats
3211 things a little differently :( ) */
3212 if (parity_type == '1') {
3213 if (c%2 == 0) { /* even component */
3214 even_pair_failed = 1;
3215 } else { /* odd component. If
3216 we're failed, and
3217 so is the even
3218 component, it's
3219 "Good Night, Charlie" */
3220 if (even_pair_failed == 1) {
3221 return(0);
3222 }
3223 }
3224 } else {
3225 /* normal accounting */
3226 num_missing++;
3227 }
3228 }
3229 if ((parity_type == '1') && (c%2 == 1)) {
3230 			/* Just finished an even/odd mirror pair without
3231 			   bailing.. reset the even_pair_failed flag,
3232 			   and go on to the next pair.... */
3233 even_pair_failed = 0;
3234 }
3235 }
3236
3237 clabel = cset->ac->clabel;
3238
3239 if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3240 ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3241 ((clabel->parityConfig == '5') && (num_missing > 1))) {
3242 /* XXX this needs to be made *much* more general */
3243 /* Too many failures */
3244 return(0);
3245 }
3246 /* otherwise, all is well, and we've got enough to take a kick
3247 at autoconfiguring this set */
3248 return(1);
3249 }
3250
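/*
 * Build an RF_Config_t for an autoconfig set, taking the common parameters
 * from the first component's label and placing each component's device
 * name according to the column recorded in its label.  The disk queueing
 * parameters are currently hardwired to a "fifo" queue.
 */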
3251 void
3252 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3253 RF_Raid_t *raidPtr)
3254 {
3255 RF_ComponentLabel_t *clabel;
3256 int i;
3257
3258 clabel = ac->clabel;
3259
3260 /* 1. Fill in the common stuff */
3261 config->numRow = clabel->num_rows = 1;
3262 config->numCol = clabel->num_columns;
3263 config->numSpare = 0; /* XXX should this be set here? */
3264 config->sectPerSU = clabel->sectPerSU;
3265 config->SUsPerPU = clabel->SUsPerPU;
3266 config->SUsPerRU = clabel->SUsPerRU;
3267 config->parityConfig = clabel->parityConfig;
3268 /* XXX... */
3269 strcpy(config->diskQueueType,"fifo");
3270 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3271 config->layoutSpecificSize = 0; /* XXX ?? */
3272
3273 while(ac!=NULL) {
3274 /* row/col values will be in range due to the checks
3275 in reasonable_label() */
3276 strcpy(config->devnames[0][ac->clabel->column],
3277 ac->devname);
3278 ac = ac->next;
3279 }
3280
3281 for(i=0;i<RF_MAXDBGV;i++) {
3282 config->debugVars[i][0] = 0;
3283 }
3284 }
3285
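/*
 * Set the autoconfigure flag for this set, both in core and in the
 * component label of every optimal component and in-use spare.
 * Returns the new value.
 */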
3286 int
3287 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3288 {
3289 RF_ComponentLabel_t clabel;
3290 struct vnode *vp;
3291 dev_t dev;
3292 int column;
3293 int sparecol;
3294
3295 raidPtr->autoconfigure = new_value;
3296
3297 for(column=0; column<raidPtr->numCol; column++) {
3298 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3299 dev = raidPtr->Disks[column].dev;
3300 vp = raidPtr->raid_cinfo[column].ci_vp;
3301 raidread_component_label(dev, vp, &clabel);
3302 clabel.autoconfigure = new_value;
3303 raidwrite_component_label(dev, vp, &clabel);
3304 }
3305 }
3306 for(column = 0; column < raidPtr->numSpare ; column++) {
3307 sparecol = raidPtr->numCol + column;
3308 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3309 dev = raidPtr->Disks[sparecol].dev;
3310 vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3311 raidread_component_label(dev, vp, &clabel);
3312 clabel.autoconfigure = new_value;
3313 raidwrite_component_label(dev, vp, &clabel);
3314 }
3315 }
3316 return(new_value);
3317 }
3318
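/*
 * As rf_set_autoconfig(), but for the root_partition flag.
 */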
3319 int
3320 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3321 {
3322 RF_ComponentLabel_t clabel;
3323 struct vnode *vp;
3324 dev_t dev;
3325 int column;
3326 int sparecol;
3327
3328 raidPtr->root_partition = new_value;
3329 for(column=0; column<raidPtr->numCol; column++) {
3330 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3331 dev = raidPtr->Disks[column].dev;
3332 vp = raidPtr->raid_cinfo[column].ci_vp;
3333 raidread_component_label(dev, vp, &clabel);
3334 clabel.root_partition = new_value;
3335 raidwrite_component_label(dev, vp, &clabel);
3336 }
3337 }
3338 for(column = 0; column < raidPtr->numSpare ; column++) {
3339 sparecol = raidPtr->numCol + column;
3340 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3341 dev = raidPtr->Disks[sparecol].dev;
3342 vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3343 raidread_component_label(dev, vp, &clabel);
3344 clabel.root_partition = new_value;
3345 raidwrite_component_label(dev, vp, &clabel);
3346 }
3347 }
3348 return(new_value);
3349 }
3350
3351 void
3352 rf_release_all_vps(RF_ConfigSet_t *cset)
3353 {
3354 RF_AutoConfig_t *ac;
3355
3356 ac = cset->ac;
3357 while(ac!=NULL) {
3358 /* Close the vp, and give it back */
3359 if (ac->vp) {
3360 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3361 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3362 vput(ac->vp);
3363 ac->vp = NULL;
3364 }
3365 ac = ac->next;
3366 }
3367 }
3368
3369
3370 void
3371 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3372 {
3373 RF_AutoConfig_t *ac;
3374 RF_AutoConfig_t *next_ac;
3375
3376 ac = cset->ac;
3377 while(ac!=NULL) {
3378 next_ac = ac->next;
3379 /* nuke the label */
3380 free(ac->clabel, M_RAIDFRAME);
3381 /* cleanup the config structure */
3382 free(ac, M_RAIDFRAME);
3383 /* "next.." */
3384 ac = next_ac;
3385 }
3386 /* and, finally, nuke the config set */
3387 free(cset, M_RAIDFRAME);
3388 }
3389
3390
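/*
 * Fill in a component label from the current configuration of raidPtr.
 * The label starts out marked dirty; per-component fields such as the
 * column and partition size are left for the caller to fill in.
 */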
3391 void
3392 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3393 {
3394 /* current version number */
3395 clabel->version = RF_COMPONENT_LABEL_VERSION;
3396 clabel->serial_number = raidPtr->serial_number;
3397 clabel->mod_counter = raidPtr->mod_counter;
3398 clabel->num_rows = 1;
3399 clabel->num_columns = raidPtr->numCol;
3400 clabel->clean = RF_RAID_DIRTY; /* not clean */
3401 clabel->status = rf_ds_optimal; /* "It's good!" */
3402
3403 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3404 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3405 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3406
3407 clabel->blockSize = raidPtr->bytesPerSector;
3408 clabel->numBlocks = raidPtr->sectorsPerDisk;
3409
3410 /* XXX not portable */
3411 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3412 clabel->maxOutstanding = raidPtr->maxOutstanding;
3413 clabel->autoconfigure = raidPtr->autoconfigure;
3414 clabel->root_partition = raidPtr->root_partition;
3415 clabel->last_unit = raidPtr->raidid;
3416 clabel->config_order = raidPtr->config_order;
3417 }
3418
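/*
 * Configure a RAID set from an autoconfig set.  We try to reuse the unit
 * number recorded in the component labels and fall back to the highest
 * free unit, build an RF_Config_t, run rf_Configure()/raidinit(), mark the
 * components dirty, and note whether the set is eligible to be the root
 * device.  On success *unit is set to the configured unit and 0 is
 * returned.
 */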
3419 int
3420 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
3421 {
3422 RF_Raid_t *raidPtr;
3423 RF_Config_t *config;
3424 int raidID;
3425 int retcode;
3426
3427 #ifdef DEBUG
3428 printf("RAID autoconfigure\n");
3429 #endif
3430
3431 retcode = 0;
3432 *unit = -1;
3433
3434 /* 1. Create a config structure */
3435
3436 config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
3437 M_RAIDFRAME,
3438 M_NOWAIT);
3439 if (config==NULL) {
3440 printf("Out of mem!?!?\n");
3441 /* XXX do something more intelligent here. */
3442 return(1);
3443 }
3444
3445 memset(config, 0, sizeof(RF_Config_t));
3446
3447 /*
3448 2. Figure out what RAID ID this one is supposed to live at
3449 See if we can get the same RAID dev that it was configured
3450 on last time..
3451 */
3452
3453 raidID = cset->ac->clabel->last_unit;
3454 if ((raidID < 0) || (raidID >= numraid)) {
3455 /* let's not wander off into lala land. */
3456 raidID = numraid - 1;
3457 }
3458 if (raidPtrs[raidID]->valid != 0) {
3459
3460 /*
3461 Nope... Go looking for an alternative...
3462 	   Start high so we don't immediately grab raid0 just
3463 	   because it happens to be free.
3464 */
3465
3466 for(raidID = numraid - 1; raidID >= 0; raidID--) {
3467 if (raidPtrs[raidID]->valid == 0) {
3468 /* can use this one! */
3469 break;
3470 }
3471 }
3472 }
3473
3474 if (raidID < 0) {
3475 /* punt... */
3476 printf("Unable to auto configure this set!\n");
3477 printf("(Out of RAID devs!)\n");
3478 free(config, M_RAIDFRAME);
3479 return(1);
3480 }
3481
3482 #ifdef DEBUG
3483 printf("Configuring raid%d:\n",raidID);
3484 #endif
3485
3486 raidPtr = raidPtrs[raidID];
3487
3488 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3489 raidPtr->raidid = raidID;
3490 raidPtr->openings = RAIDOUTSTANDING;
3491
3492 /* 3. Build the configuration structure */
3493 rf_create_configuration(cset->ac, config, raidPtr);
3494
3495 /* 4. Do the configuration */
3496 retcode = rf_Configure(raidPtr, config, cset->ac);
3497
3498 if (retcode == 0) {
3499
3500 raidinit(raidPtrs[raidID]);
3501
3502 rf_markalldirty(raidPtrs[raidID]);
3503 raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
3504 if (cset->ac->clabel->root_partition==1) {
3505 /* everything configured just fine. Make a note
3506 that this set is eligible to be root. */
3507 cset->rootable = 1;
3508 /* XXX do this here? */
3509 raidPtrs[raidID]->root_partition = 1;
3510 }
3511 }
3512
3513 /* 5. Cleanup */
3514 free(config, M_RAIDFRAME);
3515
3516 *unit = raidID;
3517 return(retcode);
3518 }
3519
3520 void
3521 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3522 {
3523 struct buf *bp;
3524
3525 bp = (struct buf *)desc->bp;
3526 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3527 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3528 }
3529
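/*
 * Convenience wrapper: set up a pool at IPL_BIO, prime it with xmin items,
 * and use xmin/xmax as the low and high watermarks.
 */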
3530 void
3531 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
3532 size_t xmin, size_t xmax)
3533 {
3534 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3535 pool_sethiwat(p, xmax);
3536 pool_prime(p, xmin);
3537 pool_setlowat(p, xmin);
3538 }
3539
3540 /*
3541 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3542 * if there is IO pending and if that IO could possibly be done for a
3543 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3544 * otherwise.
3545 *
3546 */
3547
3548 int
3549 rf_buf_queue_check(int raidid)
3550 {
3551 if ((BUFQ_PEEK(raid_softc[raidid].buf_queue) != NULL) &&
3552 raidPtrs[raidid]->openings > 0) {
3553 /* there is work to do */
3554 return 0;
3555 }
3556 /* default is nothing to do */
3557 return 1;
3558 }
3559
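/*
 * Determine the size of a component: try DIOCGPART first (ordinary
 * partitions), then fall back to DIOCGWEDGEINFO for wedges, where we have
 * to assume 512-byte sectors.  In both cases rf_protectedSectors are
 * subtracted to leave room for RAIDframe's per-component metadata.
 */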
3560 int
3561 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
3562 {
3563 struct partinfo dpart;
3564 struct dkwedge_info dkw;
3565 int error;
3566
3567 error = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, l->l_cred);
3568 if (error == 0) {
3569 diskPtr->blockSize = dpart.disklab->d_secsize;
3570 diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
3571 diskPtr->partitionSize = dpart.part->p_size;
3572 return 0;
3573 }
3574
3575 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, l->l_cred);
3576 if (error == 0) {
3577 diskPtr->blockSize = 512; /* XXX */
3578 diskPtr->numBlocks = dkw.dkw_size - rf_protectedSectors;
3579 diskPtr->partitionSize = dkw.dkw_size;
3580 return 0;
3581 }
3582 return error;
3583 }
3584
3585 static int
3586 raid_match(struct device *self, struct cfdata *cfdata,
3587 void *aux)
3588 {
3589 return 1;
3590 }
3591
3592 static void
3593 raid_attach(struct device *parent, struct device *self,
3594 void *aux)
3595 {
3596
3597 }
3598
3599
3600 static int
3601 raid_detach(struct device *self, int flags)
3602 {
3603 struct raid_softc *rs = (struct raid_softc *)self;
3604
3605 if (rs->sc_flags & RAIDF_INITED)
3606 return EBUSY;
3607
3608 return 0;
3609 }
3610
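/*
 * Publish the same fabricated geometry that raidgetdefaultlabel() uses
 * (sectors per track = data sectors per stripe, 4 * numCol tracks per
 * cylinder) through the device properties dictionary, releasing any
 * previously attached disk-info dictionary.
 */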
3611 static void
3612 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
3613 {
3614 prop_dictionary_t disk_info, odisk_info, geom;
3615 disk_info = prop_dictionary_create();
3616 geom = prop_dictionary_create();
3617 prop_dictionary_set_uint64(geom, "sectors-per-unit",
3618 raidPtr->totalSectors);
3619 prop_dictionary_set_uint32(geom, "sector-size",
3620 raidPtr->bytesPerSector);
3621
3622 prop_dictionary_set_uint16(geom, "sectors-per-track",
3623 raidPtr->Layout.dataSectorsPerStripe);
3624 prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
3625 4 * raidPtr->numCol);
3626
3627 prop_dictionary_set_uint64(geom, "cylinders-per-unit",
3628 raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
3629 (4 * raidPtr->numCol)));
3630
3631 prop_dictionary_set(disk_info, "geometry", geom);
3632 prop_object_release(geom);
3633 prop_dictionary_set(device_properties(rs->sc_dev),
3634 "disk-info", disk_info);
3635 odisk_info = rs->sc_dkdev.dk_info;
3636 rs->sc_dkdev.dk_info = disk_info;
3637 if (odisk_info)
3638 prop_object_release(odisk_info);
3639 }
3640