/*	$NetBSD: rf_netbsdkintf.c,v 1.245.4.1 2008/05/16 02:25:01 yamt Exp $	*/
2 /*-
3 * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Greg Oster; Jason R. Thorpe.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * Copyright (c) 1990, 1993
33 * The Regents of the University of California. All rights reserved.
34 *
35 * This code is derived from software contributed to Berkeley by
36 * the Systems Programming Group of the University of Utah Computer
37 * Science Department.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 * may be used to endorse or promote products derived from this software
49 * without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 *
63 * from: Utah $Hdr: cd.c 1.6 90/11/28$
64 *
65 * @(#)cd.c 8.2 (Berkeley) 11/16/93
66 */
67
68 /*
69 * Copyright (c) 1988 University of Utah.
70 *
71 * This code is derived from software contributed to Berkeley by
72 * the Systems Programming Group of the University of Utah Computer
73 * Science Department.
74 *
75 * Redistribution and use in source and binary forms, with or without
76 * modification, are permitted provided that the following conditions
77 * are met:
78 * 1. Redistributions of source code must retain the above copyright
79 * notice, this list of conditions and the following disclaimer.
80 * 2. Redistributions in binary form must reproduce the above copyright
81 * notice, this list of conditions and the following disclaimer in the
82 * documentation and/or other materials provided with the distribution.
83 * 3. All advertising materials mentioning features or use of this software
84 * must display the following acknowledgement:
85 * This product includes software developed by the University of
86 * California, Berkeley and its contributors.
87 * 4. Neither the name of the University nor the names of its contributors
88 * may be used to endorse or promote products derived from this software
89 * without specific prior written permission.
90 *
91 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
92 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
93 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
94 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
95 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
96 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
97 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
98 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
99 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
100 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
101 * SUCH DAMAGE.
102 *
103 * from: Utah $Hdr: cd.c 1.6 90/11/28$
104 *
105 * @(#)cd.c 8.2 (Berkeley) 11/16/93
106 */
107
108 /*
109 * Copyright (c) 1995 Carnegie-Mellon University.
110 * All rights reserved.
111 *
112 * Authors: Mark Holland, Jim Zelenka
113 *
114 * Permission to use, copy, modify and distribute this software and
115 * its documentation is hereby granted, provided that both the copyright
116 * notice and this permission notice appear in all copies of the
117 * software, derivative works or modified versions, and any portions
118 * thereof, and that both notices appear in supporting documentation.
119 *
120 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
121 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
122 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
123 *
124 * Carnegie Mellon requests users of this software to return to
125 *
126 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
127 * School of Computer Science
128 * Carnegie Mellon University
129 * Pittsburgh PA 15213-3890
130 *
131 * any improvements or extensions that they make and grant Carnegie the
132 * rights to redistribute these changes.
133 */
134
135 /***********************************************************
136 *
137 * rf_kintf.c -- the kernel interface routines for RAIDframe
138 *
139 ***********************************************************/
140
141 #include <sys/cdefs.h>
142 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.245.4.1 2008/05/16 02:25:01 yamt Exp $");
143
144 #include <sys/param.h>
145 #include <sys/errno.h>
146 #include <sys/pool.h>
147 #include <sys/proc.h>
148 #include <sys/queue.h>
149 #include <sys/disk.h>
150 #include <sys/device.h>
151 #include <sys/stat.h>
152 #include <sys/ioctl.h>
153 #include <sys/fcntl.h>
154 #include <sys/systm.h>
155 #include <sys/vnode.h>
156 #include <sys/disklabel.h>
157 #include <sys/conf.h>
158 #include <sys/buf.h>
159 #include <sys/bufq.h>
160 #include <sys/user.h>
161 #include <sys/reboot.h>
162 #include <sys/kauth.h>
163
164 #include <prop/proplib.h>
165
166 #include <dev/raidframe/raidframevar.h>
167 #include <dev/raidframe/raidframeio.h>
168 #include "raid.h"
169 #include "opt_raid_autoconfig.h"
170 #include "rf_raid.h"
171 #include "rf_copyback.h"
172 #include "rf_dag.h"
173 #include "rf_dagflags.h"
174 #include "rf_desc.h"
175 #include "rf_diskqueue.h"
176 #include "rf_etimer.h"
177 #include "rf_general.h"
178 #include "rf_kintf.h"
179 #include "rf_options.h"
180 #include "rf_driver.h"
181 #include "rf_parityscan.h"
182 #include "rf_threadstuff.h"
183
184 #ifdef DEBUG
185 int rf_kdebug_level = 0;
186 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
187 #else /* DEBUG */
188 #define db1_printf(a) { }
189 #endif /* DEBUG */
190
191 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
192
193 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
194
195 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
196 * spare table */
197 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
198 * installation process */
199
200 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
201
202 /* prototypes */
203 static void KernelWakeupFunc(struct buf *);
204 static void InitBP(struct buf *, struct vnode *, unsigned,
205 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
206 void *, int, struct proc *);
207 static void raidinit(RF_Raid_t *);
208
209 void raidattach(int);
210 static int raid_match(struct device *, struct cfdata *, void *);
211 static void raid_attach(struct device *, struct device *, void *);
212 static int raid_detach(struct device *, int);
213
214 dev_type_open(raidopen);
215 dev_type_close(raidclose);
216 dev_type_read(raidread);
217 dev_type_write(raidwrite);
218 dev_type_ioctl(raidioctl);
219 dev_type_strategy(raidstrategy);
220 dev_type_dump(raiddump);
221 dev_type_size(raidsize);
222
223 const struct bdevsw raid_bdevsw = {
224 raidopen, raidclose, raidstrategy, raidioctl,
225 raiddump, raidsize, D_DISK
226 };
227
228 const struct cdevsw raid_cdevsw = {
229 raidopen, raidclose, raidread, raidwrite, raidioctl,
230 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
231 };
232
233 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
234
235 /* XXX Not sure if the following should be replacing the raidPtrs above,
236 or if it should be used in conjunction with that...
237 */
238
239 struct raid_softc {
240 struct device *sc_dev;
241 int sc_flags; /* flags */
242 int sc_cflags; /* configuration flags */
243 uint64_t sc_size; /* size of the raid device */
244 char sc_xname[20]; /* XXX external name */
245 struct disk sc_dkdev; /* generic disk device info */
246 struct bufq_state *buf_queue; /* used for the device queue */
247 };
248 /* sc_flags */
249 #define RAIDF_INITED 0x01 /* unit has been initialized */
250 #define RAIDF_WLABEL 0x02 /* label area is writable */
251 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
252 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
253 #define RAIDF_LOCKED 0x80 /* unit is locked */
254
255 #define raidunit(x) DISKUNIT(x)
256 int numraid = 0;
257
258 extern struct cfdriver raid_cd;
259 CFATTACH_DECL_NEW(raid, sizeof(struct raid_softc),
260 raid_match, raid_attach, raid_detach, NULL);
261
262 /*
263 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
264 * Be aware that large numbers can allow the driver to consume a lot of
265 * kernel memory, especially on writes, and in degraded mode reads.
266 *
267 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
268 * a single 64K write will typically require 64K for the old data,
269 * 64K for the old parity, and 64K for the new parity, for a total
270 * of 192K (if the parity buffer is not re-used immediately).
271 * Even it if is used immediately, that's still 128K, which when multiplied
272 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
273 *
274 * Now in degraded mode, for example, a 64K read on the above setup may
275 * require data reconstruction, which will require *all* of the 4 remaining
276 * disks to participate -- 4 * 32K/disk == 128K again.
277 */
278
279 #ifndef RAIDOUTSTANDING
280 #define RAIDOUTSTANDING 6
281 #endif
282
283 #define RAIDLABELDEV(dev) \
284 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
285
286 /* declared here, and made public, for the benefit of KVM stuff.. */
287 struct raid_softc *raid_softc;
288
289 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
290 struct disklabel *);
291 static void raidgetdisklabel(dev_t);
292 static void raidmakedisklabel(struct raid_softc *);
293
294 static int raidlock(struct raid_softc *);
295 static void raidunlock(struct raid_softc *);
296
297 static void rf_markalldirty(RF_Raid_t *);
298 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
299
300 void rf_ReconThread(struct rf_recon_req *);
301 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
302 void rf_CopybackThread(RF_Raid_t *raidPtr);
303 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
304 int rf_autoconfig(struct device *self);
305 void rf_buildroothack(RF_ConfigSet_t *);
306
307 RF_AutoConfig_t *rf_find_raid_components(void);
308 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
309 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
310 static int rf_reasonable_label(RF_ComponentLabel_t *);
311 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
312 int rf_set_autoconfig(RF_Raid_t *, int);
313 int rf_set_rootpartition(RF_Raid_t *, int);
314 void rf_release_all_vps(RF_ConfigSet_t *);
315 void rf_cleanup_config_set(RF_ConfigSet_t *);
316 int rf_have_enough_components(RF_ConfigSet_t *);
317 int rf_auto_config_set(RF_ConfigSet_t *, int *);
318
319 static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not
320 allow autoconfig to take place.
321 Note that this is overridden by having
322 RAID_AUTOCONFIG as an option in the
323 kernel config file. */
324
325 struct RF_Pools_s rf_pools;
326
/*
 * Pseudo-device attach routine.  Allocates per-unit RAID descriptors and
 * softc state for "num" units, boots the RAIDframe core, hooks the driver
 * into autoconf, and registers a finalizer that will auto-configure RAID
 * sets once all real hardware devices have been found.
 */
void
raidattach(int num)
{
	int raidID;
	int i, rc;

#ifdef DEBUG
	printf("raidattach: Asked for %d units\n", num);
#endif

	if (num <= 0) {
#ifdef DIAGNOSTIC
		panic("raidattach: count <= 0");
#endif
		return;
	}
	/* This is where all the initialization stuff gets done. */

	numraid = num;

	/* Make some space for requested number of units... */

	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
	if (raidPtrs == NULL) {
		panic("raidPtrs is NULL!!");
	}

	rf_mutex_init(&rf_sparet_wait_mutex);

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;

	for (i = 0; i < num; i++)
		raidPtrs[i] = NULL;
	rc = rf_BootRaidframe();
	if (rc == 0)
		aprint_normal("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	/* put together some datastructures like the CCD device does.. This
	 * lets us lock the device and what-not when it gets opened. */

	raid_softc = (struct raid_softc *)
		malloc(num * sizeof(struct raid_softc),
		       M_RAIDFRAME, M_NOWAIT);
	if (raid_softc == NULL) {
		aprint_error("WARNING: no memory for RAIDframe driver\n");
		return;
	}

	memset(raid_softc, 0, num * sizeof(struct raid_softc));

	for (raidID = 0; raidID < num; raidID++) {
		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);

		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
			  (RF_Raid_t *));
		if (raidPtrs[raidID] == NULL) {
			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
			/* NOTE(review): descriptors and bufqs allocated so
			   far are leaked here; numraid is clamped so the
			   rest of the driver never touches the NULL slots. */
			numraid = raidID;
			return;
		}
	}

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

#ifdef RAID_AUTOCONFIG
	raidautoconfig = 1;
#endif

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
406
407 int
408 rf_autoconfig(struct device *self)
409 {
410 RF_AutoConfig_t *ac_list;
411 RF_ConfigSet_t *config_sets;
412
413 if (raidautoconfig == 0)
414 return (0);
415
416 /* XXX This code can only be run once. */
417 raidautoconfig = 0;
418
419 /* 1. locate all RAID components on the system */
420 #ifdef DEBUG
421 printf("Searching for RAID components...\n");
422 #endif
423 ac_list = rf_find_raid_components();
424
425 /* 2. Sort them into their respective sets. */
426 config_sets = rf_create_auto_sets(ac_list);
427
428 /*
429 * 3. Evaluate each set andconfigure the valid ones.
430 * This gets done in rf_buildroothack().
431 */
432 rf_buildroothack(config_sets);
433
434 return 1;
435 }
436
/*
 * Walk the list of auto-detected configuration sets, configure the
 * eligible ones, and try to work out which configured set (if any)
 * contains the boot device, so the root file system can live on RAID.
 * Resources of sets that are not (or cannot be) configured are released.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int retcode;
	int raidID;
	int rootID;
	int col;
	int num_root;
	char *devname;

	rootID = 0;
	num_root = 0;
	cset = config_sets;
	while(cset != NULL ) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure==1) {
			retcode = rf_auto_config_set(cset,&raidID);
			if (!retcode) {
#ifdef DEBUG
				printf("raid%d: configured ok\n", raidID);
#endif
				if (cset->rootable) {
					rootID = raidID;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
#ifdef DEBUG
				printf("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
#endif
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		/* Exactly one rootable set: it becomes the boot device. */
		booted_device = raid_softc[rootID].sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/*
		 * Disambiguate: re-count, keeping only the rootable sets
		 * that actually contain the component we booted from.
		 */
		num_root = 0;
		for (raidID = 0; raidID < numraid; raidID++) {
			if (raidPtrs[raidID]->valid == 0)
				continue;

			if (raidPtrs[raidID]->root_partition == 0)
				continue;

			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
				devname = raidPtrs[raidID]->Disks[col].devname;
				/* skip the leading "/dev/" before comparing
				   against the boot device's xname */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
					    strlen(device_xname(booted_device))) != 0)
					continue;
#ifdef DEBUG
				printf("raid%d includes boot device %s\n",
				       raidID, devname);
#endif
				num_root++;
				rootID = raidID;
			}
		}

		if (num_root == 1) {
			booted_device = raid_softc[rootID].sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
	/* num_root == 0: nothing rootable was configured; leave
	   booted_device untouched. */
}
537
538
539 int
540 raidsize(dev_t dev)
541 {
542 struct raid_softc *rs;
543 struct disklabel *lp;
544 int part, unit, omask, size;
545
546 unit = raidunit(dev);
547 if (unit >= numraid)
548 return (-1);
549 rs = &raid_softc[unit];
550
551 if ((rs->sc_flags & RAIDF_INITED) == 0)
552 return (-1);
553
554 part = DISKPART(dev);
555 omask = rs->sc_dkdev.dk_openmask & (1 << part);
556 lp = rs->sc_dkdev.dk_label;
557
558 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
559 return (-1);
560
561 if (lp->d_partitions[part].p_fstype != FS_SWAP)
562 size = -1;
563 else
564 size = lp->d_partitions[part].p_size *
565 (lp->d_secsize / DEV_BSIZE);
566
567 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
568 return (-1);
569
570 return (size);
571
572 }
573
/*
 * Kernel crash-dump entry point.  Dumping is only supported to RAID 1
 * sets: pick the best surviving component (master, spared master, slave,
 * spared slave -- in that order) and hand the dump straight through to
 * the underlying component's block driver.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int part, c, sparecol, j, scol, dumpto;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);

	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* the dump must be a whole number of DEV_BSIZE blocks */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* refuse a dump that would run past the end of the unit */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	 */

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
/*
 * Open a partition of a RAID unit: validate the unit and partition,
 * (re)read the disklabel on first open, record the open in the char/block
 * open masks, and mark all components dirty on the very first open of a
 * configured set.
 */
/* ARGSUSED */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, pmask;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);
	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* first open of a configured unit: refresh the disklabel */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty( raidPtrs[unit] );
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	/* Note: the success path also exits through "bad" with error == 0. */
	raidunlock(rs);

	return (error);


}
/*
 * Close a partition of a RAID unit.  On the last close of a configured
 * unit the component labels are marked clean; if the system is shutting
 * down, the RAID set is also shut down and the device detached.
 */
/* ARGSUSED */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct cfdata *cf;
	struct raid_softc *rs;
	int error = 0;
	int part;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(raidPtrs[unit],
						 RF_FINAL_COMPONENT_UPDATE);
		if (doing_shutdown) {
			/* last one, and we're going down, so
			   lights out for this RAID set too. */
			error = rf_Shutdown(raidPtrs[unit]);

			/* It's no longer initialized... */
			rs->sc_flags &= ~RAIDF_INITED;

			/* detach the device */

			cf = device_cfdata(rs->sc_dev);
			error = config_detach(rs->sc_dev, DETACH_QUIET);
			free(cf, M_RAIDFRAME);

			/* Detach the disk. */
			disk_detach(&rs->sc_dkdev);
			disk_destroy(&rs->sc_dkdev);
		}
	}

	raidunlock(rs);
	/* NOTE(review): errors from rf_Shutdown()/config_detach() are stored
	   in "error" but deliberately(?) discarded here -- the function
	   always returns 0 once the lock was obtained.  Confirm before
	   changing: callers may depend on close never failing. */
	return (0);

}
856
857 void
858 raidstrategy(struct buf *bp)
859 {
860 int s;
861
862 unsigned int raidID = raidunit(bp->b_dev);
863 RF_Raid_t *raidPtr;
864 struct raid_softc *rs = &raid_softc[raidID];
865 int wlabel;
866
867 if ((rs->sc_flags & RAIDF_INITED) ==0) {
868 bp->b_error = ENXIO;
869 goto done;
870 }
871 if (raidID >= numraid || !raidPtrs[raidID]) {
872 bp->b_error = ENODEV;
873 goto done;
874 }
875 raidPtr = raidPtrs[raidID];
876 if (!raidPtr->valid) {
877 bp->b_error = ENODEV;
878 goto done;
879 }
880 if (bp->b_bcount == 0) {
881 db1_printf(("b_bcount is zero..\n"));
882 goto done;
883 }
884
885 /*
886 * Do bounds checking and adjust transfer. If there's an
887 * error, the bounds check will flag that for us.
888 */
889
890 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
891 if (DISKPART(bp->b_dev) == RAW_PART) {
892 uint64_t size; /* device size in DEV_BSIZE unit */
893
894 if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
895 size = raidPtr->totalSectors <<
896 (raidPtr->logBytesPerSector - DEV_BSHIFT);
897 } else {
898 size = raidPtr->totalSectors >>
899 (DEV_BSHIFT - raidPtr->logBytesPerSector);
900 }
901 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
902 goto done;
903 }
904 } else {
905 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
906 db1_printf(("Bounds check failed!!:%d %d\n",
907 (int) bp->b_blkno, (int) wlabel));
908 goto done;
909 }
910 }
911 s = splbio();
912
913 bp->b_resid = 0;
914
915 /* stuff it onto our queue */
916 BUFQ_PUT(rs->buf_queue, bp);
917
918 /* scheduled the IO to happen at the next convenient time */
919 wakeup(&(raidPtrs[raidID]->iodone));
920
921 splx(s);
922 return;
923
924 done:
925 bp->b_resid = bp->b_bcount;
926 biodone(bp);
927 }
928 /* ARGSUSED */
929 int
930 raidread(dev_t dev, struct uio *uio, int flags)
931 {
932 int unit = raidunit(dev);
933 struct raid_softc *rs;
934
935 if (unit >= numraid)
936 return (ENXIO);
937 rs = &raid_softc[unit];
938
939 if ((rs->sc_flags & RAIDF_INITED) == 0)
940 return (ENXIO);
941
942 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
943
944 }
945 /* ARGSUSED */
946 int
947 raidwrite(dev_t dev, struct uio *uio, int flags)
948 {
949 int unit = raidunit(dev);
950 struct raid_softc *rs;
951
952 if (unit >= numraid)
953 return (ENXIO);
954 rs = &raid_softc[unit];
955
956 if ((rs->sc_flags & RAIDF_INITED) == 0)
957 return (ENXIO);
958
959 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
960
961 }
962
963 int
964 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
965 {
966 int unit = raidunit(dev);
967 int error = 0;
968 int part, pmask;
969 struct cfdata *cf;
970 struct raid_softc *rs;
971 RF_Config_t *k_cfg, *u_cfg;
972 RF_Raid_t *raidPtr;
973 RF_RaidDisk_t *diskPtr;
974 RF_AccTotals_t *totals;
975 RF_DeviceConfig_t *d_cfg, **ucfgp;
976 u_char *specific_buf;
977 int retcode = 0;
978 int column;
979 int raidid;
980 struct rf_recon_req *rrcopy, *rr;
981 RF_ComponentLabel_t *clabel;
982 RF_ComponentLabel_t *ci_label;
983 RF_ComponentLabel_t **clabel_ptr;
984 RF_SingleComponent_t *sparePtr,*componentPtr;
985 RF_SingleComponent_t component;
986 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
987 int i, j, d;
988 #ifdef __HAVE_OLD_DISKLABEL
989 struct disklabel newlabel;
990 #endif
991 struct dkwedge_info *dkw;
992
993 if (unit >= numraid)
994 return (ENXIO);
995 rs = &raid_softc[unit];
996 raidPtr = raidPtrs[unit];
997
998 db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
999 (int) DISKPART(dev), (int) unit, (int) cmd));
1000
1001 /* Must be open for writes for these commands... */
1002 switch (cmd) {
1003 #ifdef DIOCGSECTORSIZE
1004 case DIOCGSECTORSIZE:
1005 *(u_int *)data = raidPtr->bytesPerSector;
1006 return 0;
1007 case DIOCGMEDIASIZE:
1008 *(off_t *)data =
1009 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1010 return 0;
1011 #endif
1012 case DIOCSDINFO:
1013 case DIOCWDINFO:
1014 #ifdef __HAVE_OLD_DISKLABEL
1015 case ODIOCWDINFO:
1016 case ODIOCSDINFO:
1017 #endif
1018 case DIOCWLABEL:
1019 case DIOCAWEDGE:
1020 case DIOCDWEDGE:
1021 if ((flag & FWRITE) == 0)
1022 return (EBADF);
1023 }
1024
1025 /* Must be initialized for these... */
1026 switch (cmd) {
1027 case DIOCGDINFO:
1028 case DIOCSDINFO:
1029 case DIOCWDINFO:
1030 #ifdef __HAVE_OLD_DISKLABEL
1031 case ODIOCGDINFO:
1032 case ODIOCWDINFO:
1033 case ODIOCSDINFO:
1034 case ODIOCGDEFLABEL:
1035 #endif
1036 case DIOCGPART:
1037 case DIOCWLABEL:
1038 case DIOCGDEFLABEL:
1039 case DIOCAWEDGE:
1040 case DIOCDWEDGE:
1041 case DIOCLWEDGES:
1042 case RAIDFRAME_SHUTDOWN:
1043 case RAIDFRAME_REWRITEPARITY:
1044 case RAIDFRAME_GET_INFO:
1045 case RAIDFRAME_RESET_ACCTOTALS:
1046 case RAIDFRAME_GET_ACCTOTALS:
1047 case RAIDFRAME_KEEP_ACCTOTALS:
1048 case RAIDFRAME_GET_SIZE:
1049 case RAIDFRAME_FAIL_DISK:
1050 case RAIDFRAME_COPYBACK:
1051 case RAIDFRAME_CHECK_RECON_STATUS:
1052 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1053 case RAIDFRAME_GET_COMPONENT_LABEL:
1054 case RAIDFRAME_SET_COMPONENT_LABEL:
1055 case RAIDFRAME_ADD_HOT_SPARE:
1056 case RAIDFRAME_REMOVE_HOT_SPARE:
1057 case RAIDFRAME_INIT_LABELS:
1058 case RAIDFRAME_REBUILD_IN_PLACE:
1059 case RAIDFRAME_CHECK_PARITY:
1060 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1061 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1062 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1063 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1064 case RAIDFRAME_SET_AUTOCONFIG:
1065 case RAIDFRAME_SET_ROOT:
1066 case RAIDFRAME_DELETE_COMPONENT:
1067 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1068 if ((rs->sc_flags & RAIDF_INITED) == 0)
1069 return (ENXIO);
1070 }
1071
1072 switch (cmd) {
1073
1074 /* configure the system */
1075 case RAIDFRAME_CONFIGURE:
1076
1077 if (raidPtr->valid) {
1078 /* There is a valid RAID set running on this unit! */
1079 printf("raid%d: Device already configured!\n",unit);
1080 return(EINVAL);
1081 }
1082
1083 /* copy-in the configuration information */
1084 /* data points to a pointer to the configuration structure */
1085
1086 u_cfg = *((RF_Config_t **) data);
1087 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1088 if (k_cfg == NULL) {
1089 return (ENOMEM);
1090 }
1091 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1092 if (retcode) {
1093 RF_Free(k_cfg, sizeof(RF_Config_t));
1094 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1095 retcode));
1096 return (retcode);
1097 }
1098 /* allocate a buffer for the layout-specific data, and copy it
1099 * in */
1100 if (k_cfg->layoutSpecificSize) {
1101 if (k_cfg->layoutSpecificSize > 10000) {
1102 /* sanity check */
1103 RF_Free(k_cfg, sizeof(RF_Config_t));
1104 return (EINVAL);
1105 }
1106 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1107 (u_char *));
1108 if (specific_buf == NULL) {
1109 RF_Free(k_cfg, sizeof(RF_Config_t));
1110 return (ENOMEM);
1111 }
1112 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1113 k_cfg->layoutSpecificSize);
1114 if (retcode) {
1115 RF_Free(k_cfg, sizeof(RF_Config_t));
1116 RF_Free(specific_buf,
1117 k_cfg->layoutSpecificSize);
1118 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1119 retcode));
1120 return (retcode);
1121 }
1122 } else
1123 specific_buf = NULL;
1124 k_cfg->layoutSpecific = specific_buf;
1125
1126 /* should do some kind of sanity check on the configuration.
1127 * Store the sum of all the bytes in the last byte? */
1128
1129 /* configure the system */
1130
1131 /*
1132 * Clear the entire RAID descriptor, just to make sure
1133 * there is no stale data left in the case of a
1134 * reconfiguration
1135 */
1136 memset((char *) raidPtr, 0, sizeof(RF_Raid_t));
1137 raidPtr->raidid = unit;
1138
1139 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1140
1141 if (retcode == 0) {
1142
1143 /* allow this many simultaneous IO's to
1144 this RAID device */
1145 raidPtr->openings = RAIDOUTSTANDING;
1146
1147 raidinit(raidPtr);
1148 rf_markalldirty(raidPtr);
1149 }
1150 /* free the buffers. No return code here. */
1151 if (k_cfg->layoutSpecificSize) {
1152 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1153 }
1154 RF_Free(k_cfg, sizeof(RF_Config_t));
1155
1156 return (retcode);
1157
1158 /* shutdown the system */
1159 case RAIDFRAME_SHUTDOWN:
1160
1161 if ((error = raidlock(rs)) != 0)
1162 return (error);
1163
1164 /*
1165 * If somebody has a partition mounted, we shouldn't
1166 * shutdown.
1167 */
1168
1169 part = DISKPART(dev);
1170 pmask = (1 << part);
1171 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1172 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1173 (rs->sc_dkdev.dk_copenmask & pmask))) {
1174 raidunlock(rs);
1175 return (EBUSY);
1176 }
1177
1178 retcode = rf_Shutdown(raidPtr);
1179
1180 /* It's no longer initialized... */
1181 rs->sc_flags &= ~RAIDF_INITED;
1182
1183 /* free the pseudo device attach bits */
1184
1185 cf = device_cfdata(rs->sc_dev);
1186 /* XXX this causes us to not return any errors
1187 from the above call to rf_Shutdown() */
1188 retcode = config_detach(rs->sc_dev, DETACH_QUIET);
1189 free(cf, M_RAIDFRAME);
1190
1191 /* Detach the disk. */
1192 disk_detach(&rs->sc_dkdev);
1193 disk_destroy(&rs->sc_dkdev);
1194
1195 raidunlock(rs);
1196
1197 return (retcode);
1198 case RAIDFRAME_GET_COMPONENT_LABEL:
1199 clabel_ptr = (RF_ComponentLabel_t **) data;
1200 /* need to read the component label for the disk indicated
1201 by row,column in clabel */
1202
1203 /* For practice, let's get it directly from disk, rather
1204 than from the in-core copy */
1205 RF_Malloc( clabel, sizeof( RF_ComponentLabel_t ),
1206 (RF_ComponentLabel_t *));
1207 if (clabel == NULL)
1208 return (ENOMEM);
1209
1210 retcode = copyin( *clabel_ptr, clabel,
1211 sizeof(RF_ComponentLabel_t));
1212
1213 if (retcode) {
1214 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
1215 return(retcode);
1216 }
1217
1218 clabel->row = 0; /* Don't allow looking at anything else.*/
1219
1220 column = clabel->column;
1221
1222 if ((column < 0) || (column >= raidPtr->numCol +
1223 raidPtr->numSpare)) {
1224 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
1225 return(EINVAL);
1226 }
1227
1228 retcode = raidread_component_label(raidPtr->Disks[column].dev,
1229 raidPtr->raid_cinfo[column].ci_vp,
1230 clabel );
1231
1232 if (retcode == 0) {
1233 retcode = copyout(clabel, *clabel_ptr,
1234 sizeof(RF_ComponentLabel_t));
1235 }
1236 RF_Free(clabel, sizeof(RF_ComponentLabel_t));
1237 return (retcode);
1238
1239 case RAIDFRAME_SET_COMPONENT_LABEL:
1240 clabel = (RF_ComponentLabel_t *) data;
1241
1242 /* XXX check the label for valid stuff... */
1243 /* Note that some things *should not* get modified --
1244 the user should be re-initing the labels instead of
1245 trying to patch things.
1246 */
1247
1248 raidid = raidPtr->raidid;
1249 #ifdef DEBUG
1250 printf("raid%d: Got component label:\n", raidid);
1251 printf("raid%d: Version: %d\n", raidid, clabel->version);
1252 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1253 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1254 printf("raid%d: Column: %d\n", raidid, clabel->column);
1255 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1256 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1257 printf("raid%d: Status: %d\n", raidid, clabel->status);
1258 #endif
1259 clabel->row = 0;
1260 column = clabel->column;
1261
1262 if ((column < 0) || (column >= raidPtr->numCol)) {
1263 return(EINVAL);
1264 }
1265
1266 /* XXX this isn't allowed to do anything for now :-) */
1267
1268 /* XXX and before it is, we need to fill in the rest
1269 of the fields!?!?!?! */
1270 #if 0
1271 raidwrite_component_label(
1272 raidPtr->Disks[column].dev,
1273 raidPtr->raid_cinfo[column].ci_vp,
1274 clabel );
1275 #endif
1276 return (0);
1277
1278 case RAIDFRAME_INIT_LABELS:
1279 clabel = (RF_ComponentLabel_t *) data;
1280 /*
1281 we only want the serial number from
1282 the above. We get all the rest of the information
1283 from the config that was used to create this RAID
1284 set.
1285 */
1286
1287 raidPtr->serial_number = clabel->serial_number;
1288
1289 RF_Malloc(ci_label, sizeof(RF_ComponentLabel_t),
1290 (RF_ComponentLabel_t *));
1291 if (ci_label == NULL)
1292 return (ENOMEM);
1293
1294 raid_init_component_label(raidPtr, ci_label);
1295 ci_label->serial_number = clabel->serial_number;
1296 ci_label->row = 0; /* we dont' pretend to support more */
1297
1298 for(column=0;column<raidPtr->numCol;column++) {
1299 diskPtr = &raidPtr->Disks[column];
1300 if (!RF_DEAD_DISK(diskPtr->status)) {
1301 ci_label->partitionSize = diskPtr->partitionSize;
1302 ci_label->column = column;
1303 raidwrite_component_label(
1304 raidPtr->Disks[column].dev,
1305 raidPtr->raid_cinfo[column].ci_vp,
1306 ci_label );
1307 }
1308 }
1309 RF_Free(ci_label, sizeof(RF_ComponentLabel_t));
1310
1311 return (retcode);
1312 case RAIDFRAME_SET_AUTOCONFIG:
1313 d = rf_set_autoconfig(raidPtr, *(int *) data);
1314 printf("raid%d: New autoconfig value is: %d\n",
1315 raidPtr->raidid, d);
1316 *(int *) data = d;
1317 return (retcode);
1318
1319 case RAIDFRAME_SET_ROOT:
1320 d = rf_set_rootpartition(raidPtr, *(int *) data);
1321 printf("raid%d: New rootpartition value is: %d\n",
1322 raidPtr->raidid, d);
1323 *(int *) data = d;
1324 return (retcode);
1325
1326 /* initialize all parity */
1327 case RAIDFRAME_REWRITEPARITY:
1328
1329 if (raidPtr->Layout.map->faultsTolerated == 0) {
1330 /* Parity for RAID 0 is trivially correct */
1331 raidPtr->parity_good = RF_RAID_CLEAN;
1332 return(0);
1333 }
1334
1335 if (raidPtr->parity_rewrite_in_progress == 1) {
1336 /* Re-write is already in progress! */
1337 return(EINVAL);
1338 }
1339
1340 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1341 rf_RewriteParityThread,
1342 raidPtr,"raid_parity");
1343 return (retcode);
1344
1345
1346 case RAIDFRAME_ADD_HOT_SPARE:
1347 sparePtr = (RF_SingleComponent_t *) data;
1348 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1349 retcode = rf_add_hot_spare(raidPtr, &component);
1350 return(retcode);
1351
1352 case RAIDFRAME_REMOVE_HOT_SPARE:
1353 return(retcode);
1354
1355 case RAIDFRAME_DELETE_COMPONENT:
1356 componentPtr = (RF_SingleComponent_t *)data;
1357 memcpy( &component, componentPtr,
1358 sizeof(RF_SingleComponent_t));
1359 retcode = rf_delete_component(raidPtr, &component);
1360 return(retcode);
1361
1362 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1363 componentPtr = (RF_SingleComponent_t *)data;
1364 memcpy( &component, componentPtr,
1365 sizeof(RF_SingleComponent_t));
1366 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1367 return(retcode);
1368
1369 case RAIDFRAME_REBUILD_IN_PLACE:
1370
1371 if (raidPtr->Layout.map->faultsTolerated == 0) {
1372 /* Can't do this on a RAID 0!! */
1373 return(EINVAL);
1374 }
1375
1376 if (raidPtr->recon_in_progress == 1) {
1377 /* a reconstruct is already in progress! */
1378 return(EINVAL);
1379 }
1380
1381 componentPtr = (RF_SingleComponent_t *) data;
1382 memcpy( &component, componentPtr,
1383 sizeof(RF_SingleComponent_t));
1384 component.row = 0; /* we don't support any more */
1385 column = component.column;
1386
1387 if ((column < 0) || (column >= raidPtr->numCol)) {
1388 return(EINVAL);
1389 }
1390
1391 RF_LOCK_MUTEX(raidPtr->mutex);
1392 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1393 (raidPtr->numFailures > 0)) {
1394 /* XXX 0 above shouldn't be constant!!! */
1395 /* some component other than this has failed.
1396 Let's not make things worse than they already
1397 are... */
1398 printf("raid%d: Unable to reconstruct to disk at:\n",
1399 raidPtr->raidid);
1400 printf("raid%d: Col: %d Too many failures.\n",
1401 raidPtr->raidid, column);
1402 RF_UNLOCK_MUTEX(raidPtr->mutex);
1403 return (EINVAL);
1404 }
1405 if (raidPtr->Disks[column].status ==
1406 rf_ds_reconstructing) {
1407 printf("raid%d: Unable to reconstruct to disk at:\n",
1408 raidPtr->raidid);
1409 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column);
1410
1411 RF_UNLOCK_MUTEX(raidPtr->mutex);
1412 return (EINVAL);
1413 }
1414 if (raidPtr->Disks[column].status == rf_ds_spared) {
1415 RF_UNLOCK_MUTEX(raidPtr->mutex);
1416 return (EINVAL);
1417 }
1418 RF_UNLOCK_MUTEX(raidPtr->mutex);
1419
1420 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1421 if (rrcopy == NULL)
1422 return(ENOMEM);
1423
1424 rrcopy->raidPtr = (void *) raidPtr;
1425 rrcopy->col = column;
1426
1427 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1428 rf_ReconstructInPlaceThread,
1429 rrcopy,"raid_reconip");
1430 return(retcode);
1431
1432 case RAIDFRAME_GET_INFO:
1433 if (!raidPtr->valid)
1434 return (ENODEV);
1435 ucfgp = (RF_DeviceConfig_t **) data;
1436 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1437 (RF_DeviceConfig_t *));
1438 if (d_cfg == NULL)
1439 return (ENOMEM);
1440 d_cfg->rows = 1; /* there is only 1 row now */
1441 d_cfg->cols = raidPtr->numCol;
1442 d_cfg->ndevs = raidPtr->numCol;
1443 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1444 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1445 return (ENOMEM);
1446 }
1447 d_cfg->nspares = raidPtr->numSpare;
1448 if (d_cfg->nspares >= RF_MAX_DISKS) {
1449 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1450 return (ENOMEM);
1451 }
1452 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1453 d = 0;
1454 for (j = 0; j < d_cfg->cols; j++) {
1455 d_cfg->devs[d] = raidPtr->Disks[j];
1456 d++;
1457 }
1458 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1459 d_cfg->spares[i] = raidPtr->Disks[j];
1460 }
1461 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1462 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1463
1464 return (retcode);
1465
1466 case RAIDFRAME_CHECK_PARITY:
1467 *(int *) data = raidPtr->parity_good;
1468 return (0);
1469
1470 case RAIDFRAME_RESET_ACCTOTALS:
1471 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1472 return (0);
1473
1474 case RAIDFRAME_GET_ACCTOTALS:
1475 totals = (RF_AccTotals_t *) data;
1476 *totals = raidPtr->acc_totals;
1477 return (0);
1478
1479 case RAIDFRAME_KEEP_ACCTOTALS:
1480 raidPtr->keep_acc_totals = *(int *)data;
1481 return (0);
1482
1483 case RAIDFRAME_GET_SIZE:
1484 *(int *) data = raidPtr->totalSectors;
1485 return (0);
1486
1487 /* fail a disk & optionally start reconstruction */
1488 case RAIDFRAME_FAIL_DISK:
1489
1490 if (raidPtr->Layout.map->faultsTolerated == 0) {
1491 /* Can't do this on a RAID 0!! */
1492 return(EINVAL);
1493 }
1494
1495 rr = (struct rf_recon_req *) data;
1496 rr->row = 0;
1497 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1498 return (EINVAL);
1499
1500
1501 RF_LOCK_MUTEX(raidPtr->mutex);
1502 if (raidPtr->status == rf_rs_reconstructing) {
1503 /* you can't fail a disk while we're reconstructing! */
1504 /* XXX wrong for RAID6 */
1505 RF_UNLOCK_MUTEX(raidPtr->mutex);
1506 return (EINVAL);
1507 }
1508 if ((raidPtr->Disks[rr->col].status ==
1509 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1510 /* some other component has failed. Let's not make
1511 things worse. XXX wrong for RAID6 */
1512 RF_UNLOCK_MUTEX(raidPtr->mutex);
1513 return (EINVAL);
1514 }
1515 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1516 /* Can't fail a spared disk! */
1517 RF_UNLOCK_MUTEX(raidPtr->mutex);
1518 return (EINVAL);
1519 }
1520 RF_UNLOCK_MUTEX(raidPtr->mutex);
1521
1522 /* make a copy of the recon request so that we don't rely on
1523 * the user's buffer */
1524 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1525 if (rrcopy == NULL)
1526 return(ENOMEM);
1527 memcpy(rrcopy, rr, sizeof(*rr));
1528 rrcopy->raidPtr = (void *) raidPtr;
1529
1530 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1531 rf_ReconThread,
1532 rrcopy,"raid_recon");
1533 return (0);
1534
1535 /* invoke a copyback operation after recon on whatever disk
1536 * needs it, if any */
1537 case RAIDFRAME_COPYBACK:
1538
1539 if (raidPtr->Layout.map->faultsTolerated == 0) {
1540 /* This makes no sense on a RAID 0!! */
1541 return(EINVAL);
1542 }
1543
1544 if (raidPtr->copyback_in_progress == 1) {
1545 /* Copyback is already in progress! */
1546 return(EINVAL);
1547 }
1548
1549 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1550 rf_CopybackThread,
1551 raidPtr,"raid_copyback");
1552 return (retcode);
1553
1554 /* return the percentage completion of reconstruction */
1555 case RAIDFRAME_CHECK_RECON_STATUS:
1556 if (raidPtr->Layout.map->faultsTolerated == 0) {
1557 /* This makes no sense on a RAID 0, so tell the
1558 user it's done. */
1559 *(int *) data = 100;
1560 return(0);
1561 }
1562 if (raidPtr->status != rf_rs_reconstructing)
1563 *(int *) data = 100;
1564 else {
1565 if (raidPtr->reconControl->numRUsTotal > 0) {
1566 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1567 } else {
1568 *(int *) data = 0;
1569 }
1570 }
1571 return (0);
1572 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1573 progressInfoPtr = (RF_ProgressInfo_t **) data;
1574 if (raidPtr->status != rf_rs_reconstructing) {
1575 progressInfo.remaining = 0;
1576 progressInfo.completed = 100;
1577 progressInfo.total = 100;
1578 } else {
1579 progressInfo.total =
1580 raidPtr->reconControl->numRUsTotal;
1581 progressInfo.completed =
1582 raidPtr->reconControl->numRUsComplete;
1583 progressInfo.remaining = progressInfo.total -
1584 progressInfo.completed;
1585 }
1586 retcode = copyout(&progressInfo, *progressInfoPtr,
1587 sizeof(RF_ProgressInfo_t));
1588 return (retcode);
1589
1590 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1591 if (raidPtr->Layout.map->faultsTolerated == 0) {
1592 /* This makes no sense on a RAID 0, so tell the
1593 user it's done. */
1594 *(int *) data = 100;
1595 return(0);
1596 }
1597 if (raidPtr->parity_rewrite_in_progress == 1) {
1598 *(int *) data = 100 *
1599 raidPtr->parity_rewrite_stripes_done /
1600 raidPtr->Layout.numStripe;
1601 } else {
1602 *(int *) data = 100;
1603 }
1604 return (0);
1605
1606 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1607 progressInfoPtr = (RF_ProgressInfo_t **) data;
1608 if (raidPtr->parity_rewrite_in_progress == 1) {
1609 progressInfo.total = raidPtr->Layout.numStripe;
1610 progressInfo.completed =
1611 raidPtr->parity_rewrite_stripes_done;
1612 progressInfo.remaining = progressInfo.total -
1613 progressInfo.completed;
1614 } else {
1615 progressInfo.remaining = 0;
1616 progressInfo.completed = 100;
1617 progressInfo.total = 100;
1618 }
1619 retcode = copyout(&progressInfo, *progressInfoPtr,
1620 sizeof(RF_ProgressInfo_t));
1621 return (retcode);
1622
1623 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1624 if (raidPtr->Layout.map->faultsTolerated == 0) {
1625 /* This makes no sense on a RAID 0 */
1626 *(int *) data = 100;
1627 return(0);
1628 }
1629 if (raidPtr->copyback_in_progress == 1) {
1630 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1631 raidPtr->Layout.numStripe;
1632 } else {
1633 *(int *) data = 100;
1634 }
1635 return (0);
1636
1637 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1638 progressInfoPtr = (RF_ProgressInfo_t **) data;
1639 if (raidPtr->copyback_in_progress == 1) {
1640 progressInfo.total = raidPtr->Layout.numStripe;
1641 progressInfo.completed =
1642 raidPtr->copyback_stripes_done;
1643 progressInfo.remaining = progressInfo.total -
1644 progressInfo.completed;
1645 } else {
1646 progressInfo.remaining = 0;
1647 progressInfo.completed = 100;
1648 progressInfo.total = 100;
1649 }
1650 retcode = copyout(&progressInfo, *progressInfoPtr,
1651 sizeof(RF_ProgressInfo_t));
1652 return (retcode);
1653
1654 /* the sparetable daemon calls this to wait for the kernel to
1655 * need a spare table. this ioctl does not return until a
1656 * spare table is needed. XXX -- calling mpsleep here in the
1657 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1658 * -- I should either compute the spare table in the kernel,
1659 * or have a different -- XXX XXX -- interface (a different
1660 * character device) for delivering the table -- XXX */
1661 #if 0
1662 case RAIDFRAME_SPARET_WAIT:
1663 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1664 while (!rf_sparet_wait_queue)
1665 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1666 waitreq = rf_sparet_wait_queue;
1667 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1668 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1669
1670 /* structure assignment */
1671 *((RF_SparetWait_t *) data) = *waitreq;
1672
1673 RF_Free(waitreq, sizeof(*waitreq));
1674 return (0);
1675
1676 /* wakes up a process waiting on SPARET_WAIT and puts an error
1677 * code in it that will cause the daemon to exit */
1678 case RAIDFRAME_ABORT_SPARET_WAIT:
1679 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1680 waitreq->fcol = -1;
1681 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1682 waitreq->next = rf_sparet_wait_queue;
1683 rf_sparet_wait_queue = waitreq;
1684 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1685 wakeup(&rf_sparet_wait_queue);
1686 return (0);
1687
1688 /* used by the spare table daemon to deliver a spare table
1689 * into the kernel */
1690 case RAIDFRAME_SEND_SPARET:
1691
1692 /* install the spare table */
1693 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1694
1695 /* respond to the requestor. the return status of the spare
1696 * table installation is passed in the "fcol" field */
1697 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1698 waitreq->fcol = retcode;
1699 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1700 waitreq->next = rf_sparet_resp_queue;
1701 rf_sparet_resp_queue = waitreq;
1702 wakeup(&rf_sparet_resp_queue);
1703 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1704
1705 return (retcode);
1706 #endif
1707
1708 default:
1709 break; /* fall through to the os-specific code below */
1710
1711 }
1712
1713 if (!raidPtr->valid)
1714 return (EINVAL);
1715
1716 /*
1717 * Add support for "regular" device ioctls here.
1718 */
1719
1720 switch (cmd) {
1721 case DIOCGDINFO:
1722 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1723 break;
1724 #ifdef __HAVE_OLD_DISKLABEL
1725 case ODIOCGDINFO:
1726 newlabel = *(rs->sc_dkdev.dk_label);
1727 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1728 return ENOTTY;
1729 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1730 break;
1731 #endif
1732
1733 case DIOCGPART:
1734 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1735 ((struct partinfo *) data)->part =
1736 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1737 break;
1738
1739 case DIOCWDINFO:
1740 case DIOCSDINFO:
1741 #ifdef __HAVE_OLD_DISKLABEL
1742 case ODIOCWDINFO:
1743 case ODIOCSDINFO:
1744 #endif
1745 {
1746 struct disklabel *lp;
1747 #ifdef __HAVE_OLD_DISKLABEL
1748 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1749 memset(&newlabel, 0, sizeof newlabel);
1750 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1751 lp = &newlabel;
1752 } else
1753 #endif
1754 lp = (struct disklabel *)data;
1755
1756 if ((error = raidlock(rs)) != 0)
1757 return (error);
1758
1759 rs->sc_flags |= RAIDF_LABELLING;
1760
1761 error = setdisklabel(rs->sc_dkdev.dk_label,
1762 lp, 0, rs->sc_dkdev.dk_cpulabel);
1763 if (error == 0) {
1764 if (cmd == DIOCWDINFO
1765 #ifdef __HAVE_OLD_DISKLABEL
1766 || cmd == ODIOCWDINFO
1767 #endif
1768 )
1769 error = writedisklabel(RAIDLABELDEV(dev),
1770 raidstrategy, rs->sc_dkdev.dk_label,
1771 rs->sc_dkdev.dk_cpulabel);
1772 }
1773 rs->sc_flags &= ~RAIDF_LABELLING;
1774
1775 raidunlock(rs);
1776
1777 if (error)
1778 return (error);
1779 break;
1780 }
1781
1782 case DIOCWLABEL:
1783 if (*(int *) data != 0)
1784 rs->sc_flags |= RAIDF_WLABEL;
1785 else
1786 rs->sc_flags &= ~RAIDF_WLABEL;
1787 break;
1788
1789 case DIOCGDEFLABEL:
1790 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1791 break;
1792
1793 #ifdef __HAVE_OLD_DISKLABEL
1794 case ODIOCGDEFLABEL:
1795 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1796 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1797 return ENOTTY;
1798 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1799 break;
1800 #endif
1801
1802 case DIOCAWEDGE:
1803 case DIOCDWEDGE:
1804 dkw = (void *)data;
1805
1806 /* If the ioctl happens here, the parent is us. */
1807 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1808 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1809
1810 case DIOCLWEDGES:
1811 return dkwedge_list(&rs->sc_dkdev,
1812 (struct dkwedge_list *)data, l);
1813
1814 default:
1815 retcode = ENOTTY;
1816 }
1817 return (retcode);
1818
1819 }
1820
1821
1822 /* raidinit -- complete the rest of the initialization for the
1823 RAIDframe device. */
1824
1825
/*
 * raidinit -- finish kernel-side initialization of a freshly configured
 * RAIDframe unit: mark the softc as initialized, attach the pseudo-device,
 * attach the disk structure, record the unit size and scan for wedges.
 *
 * raidPtr: the configured RAID descriptor; callers (e.g. the
 * RAIDFRAME_CONFIGURE ioctl path) only invoke this after rf_Configure()
 * has succeeded.
 */
1826 static void
1827 raidinit(RF_Raid_t *raidPtr)
1828 {
1829 struct cfdata *cf;
1830 struct raid_softc *rs;
1831 int unit;
1832 
1833 unit = raidPtr->raidid;
1834 
1835 rs = &raid_softc[unit];
1836 
/* Flag the unit as initialized up front. */
1837 /* XXX should check return code first... */
1838 rs->sc_flags |= RAIDF_INITED;
1839 
/* Build the external name ("raidN") used for the disk and wedges. */
1840 /* XXX doesn't check bounds. */
1841 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1842 
/* Attach the pseudo device; cf is freed later by the SHUTDOWN path
 * after config_detach(). */
1843 /* attach the pseudo device */
1844 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1845 cf->cf_name = raid_cd.cd_name;
1846 cf->cf_atname = raid_cd.cd_name;
1847 cf->cf_unit = unit;
1848 cf->cf_fstate = FSTATE_STAR;
1849 
1850 rs->sc_dev = config_attach_pseudo(cf);
1851 
/* NOTE(review): on attach failure we only log and keep going with a
 * NULL sc_dev -- confirm downstream users tolerate that. */
1852 if (rs->sc_dev==NULL) {
1853 printf("raid%d: config_attach_pseudo failed\n",
1854 raidPtr->raidid);
1855 }
1856 
1857 /* disk_attach actually creates space for the CPU disklabel, among
1858 * other things, so it's critical to call this *BEFORE* we try putzing
1859 * with disklabels. */
1860 
1861 disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1862 disk_attach(&rs->sc_dkdev);
1863 
1864 /* XXX There may be a weird interaction here between this, and
1865 * protectedSectors, as used in RAIDframe. */
1866 
1867 rs->sc_size = raidPtr->totalSectors;
1868 
/* Look for dk(4) wedges on the newly attached disk. */
1869 dkwedge_discover(&rs->sc_dkdev);
1870 
1871 rf_set_properties(rs, raidPtr);
1872 
1873 }
1874 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1875 /* wake up the daemon & tell it to get us a spare table
1876 * XXX
1877 * the entries in the queues should be tagged with the raidPtr
1878 * so that in the extremely rare case that two recons happen at once,
1879 * we know for which device we're requesting a spare table
1880 * XXX
1881 *
1882 * XXX This code is not currently used. GO
1883 */
/*
 * rf_GetSpareTableFromDaemon -- queue a spare-table request for the
 * user-level sparetable daemon, block until a response arrives on
 * rf_sparet_resp_queue, and return the daemon's status code (fcol).
 *
 * req: request to hand to the daemon; ownership passes to the daemon
 * path. The response element popped off rf_sparet_resp_queue is freed
 * here (it is a different allocation -- see the trailing comment).
 */
1884 int
1885 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1886 {
1887 int retcode;
1888 
/* Publish the request and wake anyone blocked in SPARET_WAIT. */
1889 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1890 req->next = rf_sparet_wait_queue;
1891 rf_sparet_wait_queue = req;
1892 wakeup(&rf_sparet_wait_queue);
1893 
/* NOTE(review): an old comment claimed "mpsleep unlocks the mutex",
 * but this now uses tsleep(), which does not release
 * rf_sparet_wait_mutex -- confirm the intended sleep/lock interaction. */
1894 /* mpsleep unlocks the mutex */
1895 while (!rf_sparet_resp_queue) {
1896 tsleep(&rf_sparet_resp_queue, PRIBIO,
1897 "raidframe getsparetable", 0);
1898 }
/* Pop the response and extract the daemon's status from fcol. */
1899 req = rf_sparet_resp_queue;
1900 rf_sparet_resp_queue = req->next;
1901 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1902 
1903 retcode = req->fcol;
1904 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1905 * alloc'd */
1906 return (retcode);
1907 }
1908 #endif
1909
1910 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1911 * bp & passes it down.
1912 * any calls originating in the kernel must use non-blocking I/O
1913 * do some extra sanity checking to return "appropriate" error values for
1914 * certain conditions (to make some standard utilities work)
1915 *
1916 * Formerly known as: rf_DoAccessKernel
1917 */
/*
 * raidstart -- drain the unit's buffer queue, translating each buf's
 * partition-relative block number to an absolute RAID address and
 * submitting it to RAIDframe via rf_DoAccess() (always non-blocking).
 *
 * Locking invariant: raidPtr->mutex is held at the top of the while
 * loop and whenever the loop condition is re-evaluated; it is dropped
 * while the buf is inspected/submitted. The early return after
 * BUFQ_GET() is taken with the mutex already released.
 */
1918 void
1919 raidstart(RF_Raid_t *raidPtr)
1920 {
1921 RF_SectorCount_t num_blocks, pb, sum;
1922 RF_RaidAddr_t raid_addr;
1923 struct partition *pp;
1924 daddr_t blocknum;
1925 int unit;
1926 struct raid_softc *rs;
1927 int do_async;
1928 struct buf *bp;
1929 int rc;
1930 
1931 unit = raidPtr->raidid;
1932 rs = &raid_softc[unit];
1933 
1934 /* quick check to see if anything has died recently */
1935 RF_LOCK_MUTEX(raidPtr->mutex);
1936 if (raidPtr->numNewFailures > 0) {
/* Component labels are updated with the mutex dropped, then the
 * new-failure count is decremented under the lock again. */
1937 RF_UNLOCK_MUTEX(raidPtr->mutex);
1938 rf_update_component_labels(raidPtr,
1939 RF_NORMAL_COMPONENT_UPDATE);
1940 RF_LOCK_MUTEX(raidPtr->mutex);
1941 raidPtr->numNewFailures--;
1942 }
1943 
1944 /* Check to see if we're at the limit... */
1945 while (raidPtr->openings > 0) {
1946 RF_UNLOCK_MUTEX(raidPtr->mutex);
1947 
1948 /* get the next item, if any, from the queue */
1949 if ((bp = BUFQ_GET(rs->buf_queue)) == NULL) {
1950 /* nothing more to do */
1951 return;
1952 }
1953 
1954 /* Ok, for the bp we have here, bp->b_blkno is relative to the
1955 * partition.. Need to make it absolute to the underlying
1956 * device.. */
1957 
1958 blocknum = bp->b_blkno;
1959 if (DISKPART(bp->b_dev) != RAW_PART) {
1960 pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
1961 blocknum += pp->p_offset;
1962 }
1963 
1964 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1965 (int) blocknum));
1966 
1967 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1968 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1969 
1970 /* *THIS* is where we adjust what block we're going to...
1971 * but DO NOT TOUCH bp->b_blkno!!! */
1972 raid_addr = blocknum;
1973 
/* pb accounts for a trailing partial sector; sum is the exclusive
 * end address, with the extra (sum < x) comparisons catching
 * wraparound of the unsigned arithmetic. */
1974 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1975 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1976 sum = raid_addr + num_blocks + pb;
1977 if (1 || rf_debugKernelAccess) {
1978 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1979 (int) raid_addr, (int) sum, (int) num_blocks,
1980 (int) pb, (int) bp->b_resid));
1981 }
/* Reject requests that run past the end of the array (or overflow). */
1982 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1983 || (sum < num_blocks) || (sum < pb)) {
1984 bp->b_error = ENOSPC;
1985 bp->b_resid = bp->b_bcount;
1986 biodone(bp);
1987 RF_LOCK_MUTEX(raidPtr->mutex);
1988 continue;
1989 }
1990 /*
1991 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1992 */
1993 
/* Reject transfers that are not a whole number of sectors. */
1994 if (bp->b_bcount & raidPtr->sectorMask) {
1995 bp->b_error = EINVAL;
1996 bp->b_resid = bp->b_bcount;
1997 biodone(bp);
1998 RF_LOCK_MUTEX(raidPtr->mutex);
1999 continue;
2000 
2001 }
2002 db1_printf(("Calling DoAccess..\n"));
2003 
2004 
/* Consume one opening for this in-flight access. */
2005 RF_LOCK_MUTEX(raidPtr->mutex);
2006 raidPtr->openings--;
2007 RF_UNLOCK_MUTEX(raidPtr->mutex);
2008 
2009 /*
2010 * Everything is async.
2011 */
2012 do_async = 1;
2013 
2014 disk_busy(&rs->sc_dkdev);
2015 
2016 /* XXX we're still at splbio() here... do we *really*
2017 need to be? */
2018 
2019 /* don't ever condition on bp->b_flags & B_WRITE.
2020 * always condition on B_READ instead */
2021 
2022 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
2023 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
2024 do_async, raid_addr, num_blocks,
2025 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2026 
/* On immediate failure, complete the buf with the error; on success
 * completion is signalled later through the iodone path. */
2027 if (rc) {
2028 bp->b_error = rc;
2029 bp->b_resid = bp->b_bcount;
2030 biodone(bp);
2031 /* continue loop */
2032 }
2033 
2034 RF_LOCK_MUTEX(raidPtr->mutex);
2035 }
2036 RF_UNLOCK_MUTEX(raidPtr->mutex);
2037 }
2038
2039
2040
2041
2042 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2043
/*
 * rf_DispatchKernelIO -- hand one RAIDframe disk-queue request to the
 * underlying component: NOPs are completed immediately through
 * KernelWakeupFunc(), reads/writes are wrapped in a buf via InitBP()
 * and sent down with VOP_STRATEGY().
 *
 * queue: the per-component disk queue (its bookkeeping -- numOutstanding,
 *        last_deq_sector, curPriority -- is updated here).
 * req:   the request to dispatch; req->queue is set so the completion
 *        callback can find its way back.
 * Returns 0 unconditionally; errors surface via the iodone path.
 */
2044 int
2045 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2046 {
2047 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2048 struct buf *bp;
2049 
2050 req->queue = queue;
2051 
2052 #if DIAGNOSTIC
2053 if (queue->raidPtr->raidid >= numraid) {
2054 printf("Invalid unit number: %d %d\n", queue->raidPtr->raidid,
2055 numraid);
2056 panic("Invalid Unit number in rf_DispatchKernelIO");
2057 }
2058 #endif
2059 
2060 bp = req->bp;
2061 
2062 switch (req->type) {
2063 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
2064 /* XXX need to do something extra here.. */
2065 /* I'm leaving this in, as I've never actually seen it used,
2066 * and I'd like folks to report it... GO */
/* NOTE(review): the doubled parentheses suggest this was probably
 * meant to be db1_printf -- harmless as written, but confirm. */
2067 printf(("WAKEUP CALLED\n"));
2068 queue->numOutstanding++;
2069 
/* Complete the NOP synchronously through the normal callback. */
2070 bp->b_flags = 0;
2071 bp->b_private = req;
2072 
2073 KernelWakeupFunc(bp);
2074 break;
2075 
2076 case RF_IO_TYPE_READ:
2077 case RF_IO_TYPE_WRITE:
2078 #if RF_ACC_TRACE > 0
2079 if (req->tracerec) {
2080 RF_ETIMER_START(req->tracerec->timer);
2081 }
2082 #endif
/* Set up the buf: target vnode/device, sector range, buffer, and
 * KernelWakeupFunc as the completion callback with req as its arg. */
2083 InitBP(bp, queue->rf_cinfo->ci_vp,
2084 op, queue->rf_cinfo->ci_dev,
2085 req->sectorOffset, req->numSector,
2086 req->buf, KernelWakeupFunc, (void *) req,
2087 queue->raidPtr->logBytesPerSector, req->b_proc);
2088 
2089 if (rf_debugKernelAccess) {
2090 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2091 (long) bp->b_blkno));
2092 }
2093 queue->numOutstanding++;
2094 queue->last_deq_sector = req->sectorOffset;
2095 /* acc wouldn't have been let in if there were any pending
2096 * reqs at any other priority */
2097 queue->curPriority = req->priority;
2098 
2099 db1_printf(("Going for %c to unit %d col %d\n",
2100 req->type, queue->raidPtr->raidid,
2101 queue->col));
2102 db1_printf(("sector %d count %d (%d bytes) %d\n",
2103 (int) req->sectorOffset, (int) req->numSector,
2104 (int) (req->numSector <<
2105 queue->raidPtr->logBytesPerSector),
2106 (int) queue->raidPtr->logBytesPerSector));
/* Fire the I/O at the component device. */
2107 VOP_STRATEGY(bp->b_vp, bp);
2108 
2109 break;
2110 
2111 default:
2112 panic("bad req->type in rf_DispatchKernelIO");
2113 }
2114 db1_printf(("Exiting from DispatchKernelIO\n"));
2115 
2116 return (0);
2117 }
2118 /* this is the callback function associated with an I/O invoked from
2119 kernel code.
2120 */
/*
 * KernelWakeupFunc -- biodone-style completion callback for component
 * I/O issued by rf_DispatchKernelIO(). Runs at splbio(). Records any
 * error in the request, fails the component on I/O error (at most once,
 * and only if that would not exceed the layout's fault tolerance), then
 * places the request on raidPtr->iodone and wakes the raidio thread.
 *
 * bp: the completed buf; bp->b_private carries the RF_DiskQueueData_t.
 */
2121 static void
2122 KernelWakeupFunc(struct buf *bp)
2123 {
2124 RF_DiskQueueData_t *req = NULL;
2125 RF_DiskQueue_t *queue;
2126 int s;
2127 
2128 s = splbio();
2129 db1_printf(("recovering the request queue:\n"));
/* Recover the request and its queue stashed by the dispatch path. */
2130 req = bp->b_private;
2131 
2132 queue = (RF_DiskQueue_t *) req->queue;
2133 
2134 #if RF_ACC_TRACE > 0
2135 if (req->tracerec) {
2136 RF_ETIMER_STOP(req->tracerec->timer);
2137 RF_ETIMER_EVAL(req->tracerec->timer);
2138 RF_LOCK_MUTEX(rf_tracing_mutex);
2139 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2140 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2141 req->tracerec->num_phys_ios++;
2142 RF_UNLOCK_MUTEX(rf_tracing_mutex);
2143 }
2144 #endif
2145 
2146 /* XXX Ok, let's get aggressive... If b_error is set, let's go
2147 * ballistic, and mark the component as hosed... */
2148 
2149 if (bp->b_error != 0) {
2150 /* Mark the disk as dead */
2151 /* but only mark it once... */
2152 /* and only if it wouldn't leave this RAID set
2153 completely broken */
/* Only fail a component that is currently optimal or a used spare,
 * and only while numFailures stays below faultsTolerated. */
2154 if (((queue->raidPtr->Disks[queue->col].status ==
2155 rf_ds_optimal) ||
2156 (queue->raidPtr->Disks[queue->col].status ==
2157 rf_ds_used_spare)) &&
2158 (queue->raidPtr->numFailures <
2159 queue->raidPtr->Layout.map->faultsTolerated)) {
2160 printf("raid%d: IO Error. Marking %s as failed.\n",
2161 queue->raidPtr->raidid,
2162 queue->raidPtr->Disks[queue->col].devname);
2163 queue->raidPtr->Disks[queue->col].status =
2164 rf_ds_failed;
2165 queue->raidPtr->status = rf_rs_degraded;
2166 queue->raidPtr->numFailures++;
2167 queue->raidPtr->numNewFailures++;
2168 } else { /* Disk is already dead... */
2169 /* printf("Disk already marked as dead!\n"); */
2170 }
2171 
2172 }
2173 
2174 /* Fill in the error value */
2175 
2176 req->error = bp->b_error;
2177 
2178 simple_lock(&queue->raidPtr->iodone_lock);
2179 
2180 /* Drop this one on the "finished" queue... */
2181 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2182 
2183 /* Let the raidio thread know there is work to be done. */
2184 wakeup(&(queue->raidPtr->iodone));
2185 
2186 simple_unlock(&queue->raidPtr->iodone_lock);
2187 
2188 splx(s);
2189 }
2190
2191
2192
2193 /*
2194 * initialize a buf structure for doing an I/O in the kernel.
2195 */
2196 static void
2197 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2198 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2199 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2200 struct proc *b_proc)
2201 {
2202 /* bp->b_flags = B_PHYS | rw_flag; */
2203 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2204 bp->b_oflags = 0;
2205 bp->b_cflags = 0;
2206 bp->b_bcount = numSect << logBytesPerSector;
2207 bp->b_bufsize = bp->b_bcount;
2208 bp->b_error = 0;
2209 bp->b_dev = dev;
2210 bp->b_data = bf;
2211 bp->b_blkno = startSect;
2212 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2213 if (bp->b_bcount == 0) {
2214 panic("bp->b_bcount is zero in InitBP!!");
2215 }
2216 bp->b_proc = b_proc;
2217 bp->b_iodone = cbFunc;
2218 bp->b_private = cbArg;
2219 bp->b_vp = b_vp;
2220 bp->b_objlock = &b_vp->v_interlock;
2221 if ((bp->b_flags & B_READ) == 0) {
2222 mutex_enter(&b_vp->v_interlock);
2223 b_vp->v_numoutput++;
2224 mutex_exit(&b_vp->v_interlock);
2225 }
2226
2227 }
2228
2229 static void
2230 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2231 struct disklabel *lp)
2232 {
2233 memset(lp, 0, sizeof(*lp));
2234
2235 /* fabricate a label... */
2236 lp->d_secperunit = raidPtr->totalSectors;
2237 lp->d_secsize = raidPtr->bytesPerSector;
2238 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2239 lp->d_ntracks = 4 * raidPtr->numCol;
2240 lp->d_ncylinders = raidPtr->totalSectors /
2241 (lp->d_nsectors * lp->d_ntracks);
2242 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2243
2244 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2245 lp->d_type = DTYPE_RAID;
2246 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2247 lp->d_rpm = 3600;
2248 lp->d_interleave = 1;
2249 lp->d_flags = 0;
2250
2251 lp->d_partitions[RAW_PART].p_offset = 0;
2252 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2253 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2254 lp->d_npartitions = RAW_PART + 1;
2255
2256 lp->d_magic = DISKMAGIC;
2257 lp->d_magic2 = DISKMAGIC;
2258 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2259
2260 }
/*
 * Read the disklabel from the raid device. If one is not present, fake one
 * up.  Also sanity-checks a label that was found against the actual
 * size of the raid and warns (without failing) on any mismatch.
 */
static void
raidgetdisklabel(dev_t dev)
{
	int unit = raidunit(dev);
	struct raid_softc *rs = &raid_softc[unit];
	const char *errstring;
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
	RF_Raid_t *raidPtr;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = raidPtrs[unit];

	/* start from a fabricated default; readdisklabel() below
	   overwrites it if an on-disk label is found */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		/* no usable label on disk; install a minimal default */
		raidmakedisklabel(rs);
	else {
		int i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%d) != "
			    "the size of raid (%ld)\n", unit, rs->sc_xname,
			    lp->d_secperunit, (long) rs->sc_size);
		/* warn about any partition running past end of device */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%ld)\n",
				       unit, rs->sc_xname, 'a' + i, (long) rs->sc_size);
		}
	}

}
/*
 * Take care of things one might want to take care of in the event
 * that a disklabel isn't present.  Called from raidgetdisklabel() when
 * readdisklabel() fails; the label was already filled in with defaults
 * by raidgetdefaultlabel().
 */
static void
raidmakedisklabel(struct raid_softc *rs)
{
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	db1_printf(("Making a label..\n"));

	/*
	 * For historical reasons, if there's no disklabel present
	 * the raw partition must be marked FS_BSDFFS.
	 */

	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;

	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));

	/* re-checksum: the fstype/packname changes above invalidated
	   the checksum computed for the default label */
	lp->d_checksum = dkcksum(lp);
}
2338 /*
2339 * Wait interruptibly for an exclusive lock.
2340 *
2341 * XXX
2342 * Several drivers do this; it should be abstracted and made MP-safe.
2343 * (Hmm... where have we seen this warning before :-> GO )
2344 */
2345 static int
2346 raidlock(struct raid_softc *rs)
2347 {
2348 int error;
2349
2350 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2351 rs->sc_flags |= RAIDF_WANTED;
2352 if ((error =
2353 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2354 return (error);
2355 }
2356 rs->sc_flags |= RAIDF_LOCKED;
2357 return (0);
2358 }
2359 /*
2360 * Unlock and wake up any waiters.
2361 */
2362 static void
2363 raidunlock(struct raid_softc *rs)
2364 {
2365
2366 rs->sc_flags &= ~RAIDF_LOCKED;
2367 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2368 rs->sc_flags &= ~RAIDF_WANTED;
2369 wakeup(rs);
2370 }
2371 }
2372
2373
2374 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2375 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2376
2377 int
2378 raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter)
2379 {
2380 RF_ComponentLabel_t clabel;
2381 raidread_component_label(dev, b_vp, &clabel);
2382 clabel.mod_counter = mod_counter;
2383 clabel.clean = RF_RAID_CLEAN;
2384 raidwrite_component_label(dev, b_vp, &clabel);
2385 return(0);
2386 }
2387
2388
2389 int
2390 raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter)
2391 {
2392 RF_ComponentLabel_t clabel;
2393 raidread_component_label(dev, b_vp, &clabel);
2394 clabel.mod_counter = mod_counter;
2395 clabel.clean = RF_RAID_DIRTY;
2396 raidwrite_component_label(dev, b_vp, &clabel);
2397 return(0);
2398 }
2399
2400 /* ARGSUSED */
2401 int
2402 raidread_component_label(dev_t dev, struct vnode *b_vp,
2403 RF_ComponentLabel_t *clabel)
2404 {
2405 struct buf *bp;
2406 const struct bdevsw *bdev;
2407 int error;
2408
2409 /* XXX should probably ensure that we don't try to do this if
2410 someone has changed rf_protected_sectors. */
2411
2412 if (b_vp == NULL) {
2413 /* For whatever reason, this component is not valid.
2414 Don't try to read a component label from it. */
2415 return(EINVAL);
2416 }
2417
2418 /* get a block of the appropriate size... */
2419 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2420 bp->b_dev = dev;
2421
2422 /* get our ducks in a row for the read */
2423 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2424 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2425 bp->b_flags |= B_READ;
2426 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2427
2428 bdev = bdevsw_lookup(bp->b_dev);
2429 if (bdev == NULL)
2430 return (ENXIO);
2431 (*bdev->d_strategy)(bp);
2432
2433 error = biowait(bp);
2434
2435 if (!error) {
2436 memcpy(clabel, bp->b_data,
2437 sizeof(RF_ComponentLabel_t));
2438 }
2439
2440 brelse(bp, 0);
2441 return(error);
2442 }
2443 /* ARGSUSED */
2444 int
2445 raidwrite_component_label(dev_t dev, struct vnode *b_vp,
2446 RF_ComponentLabel_t *clabel)
2447 {
2448 struct buf *bp;
2449 const struct bdevsw *bdev;
2450 int error;
2451
2452 /* get a block of the appropriate size... */
2453 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2454 bp->b_dev = dev;
2455
2456 /* get our ducks in a row for the write */
2457 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2458 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2459 bp->b_flags |= B_WRITE;
2460 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2461
2462 memset(bp->b_data, 0, RF_COMPONENT_INFO_SIZE );
2463
2464 memcpy(bp->b_data, clabel, sizeof(RF_ComponentLabel_t));
2465
2466 bdev = bdevsw_lookup(bp->b_dev);
2467 if (bdev == NULL)
2468 return (ENXIO);
2469 (*bdev->d_strategy)(bp);
2470 error = biowait(bp);
2471 brelse(bp, 0);
2472 if (error) {
2473 #if 1
2474 printf("Failed to write RAID component info!\n");
2475 #endif
2476 }
2477
2478 return(error);
2479 }
2480
/*
 * Bump the set's modification counter and mark every live component
 * (and every in-use spare) dirty with the new counter, so all labels
 * agree.  Failed disks are not touched at all.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			raidread_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			if (clabel.status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(
					raidPtr->Disks[c].dev,
					raidPtr->raid_cinfo[c].ci_vp,
					raidPtr->mod_counter);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			raidread_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, &clabel);

			/* the spare is labelled as the column it covers */
			clabel.row = 0;
			clabel.column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr->Disks[sparecol].dev,
				      raidPtr->raid_cinfo[sparecol].ci_vp,
				      raidPtr->mod_counter);
		}
	}
}
2551
2552
/*
 * Push refreshed component labels (new mod_counter, current status,
 * and the raid unit we are configured as) out to every optimal
 * component and every in-use spare.  When final is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, the components
 * are additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			raidread_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			/* make sure status is noted */
			clabel.status = rf_ds_optimal;

			/* bump the counter */
			clabel.mod_counter = raidPtr->mod_counter;

			/* note what unit we are configured as */
			clabel.last_unit = raidPtr->raidid;

			raidwrite_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(
						raidPtr->Disks[c].dev,
						raidPtr->raid_cinfo[c].ci_vp,
						raidPtr->mod_counter);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			raidread_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, &clabel);

			clabel.mod_counter = raidPtr->mod_counter;
			clabel.column = scol;
			clabel.status = rf_ds_optimal;
			clabel.last_unit = raidPtr->raidid;

			raidwrite_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean( raidPtr->Disks[sparecol].dev,
						       raidPtr->raid_cinfo[sparecol].ci_vp,
						       raidPtr->mod_counter);
				}
			}
		}
	}
}
2648
2649 void
2650 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2651 {
2652
2653 if (vp != NULL) {
2654 if (auto_configured == 1) {
2655 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2656 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2657 vput(vp);
2658
2659 } else {
2660 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2661 }
2662 }
2663 }
2664
2665
2666 void
2667 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2668 {
2669 int r,c;
2670 struct vnode *vp;
2671 int acd;
2672
2673
2674 /* We take this opportunity to close the vnodes like we should.. */
2675
2676 for (c = 0; c < raidPtr->numCol; c++) {
2677 vp = raidPtr->raid_cinfo[c].ci_vp;
2678 acd = raidPtr->Disks[c].auto_configured;
2679 rf_close_component(raidPtr, vp, acd);
2680 raidPtr->raid_cinfo[c].ci_vp = NULL;
2681 raidPtr->Disks[c].auto_configured = 0;
2682 }
2683
2684 for (r = 0; r < raidPtr->numSpare; r++) {
2685 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2686 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2687 rf_close_component(raidPtr, vp, acd);
2688 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2689 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2690 }
2691 }
2692
2693
/*
 * Kernel-thread body: fail the requested component and, if
 * RF_FDFLAGS_RECON was set, reconstruct its contents onto a spare.
 * Frees the request and exits the thread when done.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* second arg: 1 => also start reconstruction to a spare */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* the request was allocated by our creator; we own and free it */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2715
/*
 * Kernel-thread body: rewrite all parity on the set, then record
 * whether parity is now known good and notify any thread waiting for
 * the rewrite to finish (e.g. during shutdown).
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity!\n",raidPtr->raidid);
	} else {
		/* set the clean bit! If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop? If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2745
2746
/*
 * Kernel-thread body: copy reconstructed data back from the spare to
 * a replaced component, then exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2761
2762
/*
 * Kernel-thread body: reconstruct the named column in place (onto the
 * same component slot), free the request, and exit.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* the request was allocated by our creator; we own and free it */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2780
/*
 * Try to read a component label from dev/vp.  If a label is present,
 * reasonable, and fits within the component (partitionSize <= size), a
 * new RF_AutoConfig_t is prepended to ac_list, taking ownership of the
 * label memory and the open vnode.  Otherwise the label is freed and
 * the vnode closed and released.  On allocation failure the entire
 * ac_list is torn down and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* out of memory: destroy everything collected so far */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(dev, vp, clabel)) {
		/* Got the label. Does it look reasonable? */
		if (rf_reasonable_label(clabel) &&
		    (clabel->partitionSize <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			       cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;	/* vnode ownership moves to ac */
			ac->clabel = clabel;	/* ...and label ownership */
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: drop the label and close/release the vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2837
/*
 * Scan every disk-class device in the system looking for RAIDframe
 * components: dk(4) wedges whose partition type is RAIDFRAME, and
 * disklabel partitions of type FS_RAID.  Each candidate is handed to
 * rf_get_component(), which validates its component label.  Returns
 * the accumulated RF_AutoConfig_t list (possibly NULL).
 */
RF_AutoConfig_t *
rf_find_raid_components()
{
	struct vnode *vp;
	struct disklabel label;
	struct device *dv;
	dev_t dev;
	int bmajor, bminor, wedge;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;


	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = alldevs.tqh_first; dv != NULL;
	     dv = dv->dv_list.tqe_next) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		/* get a vnode for the raw partition of this disk */

		/* wedges (dk) have their own device numbering; plain
		   disks get the raw partition of the disk device */
		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists */
			vput(vp);
			continue;
		}

		if (wedge) {
			/* wedges carry a partition type string instead of
			   a disklabel; accept only RAIDframe-typed wedges */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				/* NOTE(review): opened with FREAD only but
				   closed with FREAD|FWRITE -- confirm */
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component takes over the open vnode */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size);
			continue;
		}

		/* Ok, the disk exists. Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		/* open each FS_RAID partition in turn and try its label */
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size);
		}
	}
	return ac_list;
}
2975
2976
2977 static int
2978 rf_reasonable_label(RF_ComponentLabel_t *clabel)
2979 {
2980
2981 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2982 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2983 ((clabel->clean == RF_RAID_CLEAN) ||
2984 (clabel->clean == RF_RAID_DIRTY)) &&
2985 clabel->row >=0 &&
2986 clabel->column >= 0 &&
2987 clabel->num_rows > 0 &&
2988 clabel->num_columns > 0 &&
2989 clabel->row < clabel->num_rows &&
2990 clabel->column < clabel->num_columns &&
2991 clabel->blockSize > 0 &&
2992 clabel->numBlocks > 0) {
2993 /* label looks reasonable enough... */
2994 return(1);
2995 }
2996 return(0);
2997 }
2998
2999
#ifdef DEBUG
/*
 * Dump the interesting fields of a component label to the console in
 * human-readable form (debug builds only).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status );
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %d\n",
	       (char) clabel->parityConfig, clabel->blockSize,
	       clabel->numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No" );
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No" );
	printf("   Last configured as: raid%d\n", clabel->last_unit );
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3027
3028 RF_ConfigSet_t *
3029 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3030 {
3031 RF_AutoConfig_t *ac;
3032 RF_ConfigSet_t *config_sets;
3033 RF_ConfigSet_t *cset;
3034 RF_AutoConfig_t *ac_next;
3035
3036
3037 config_sets = NULL;
3038
3039 /* Go through the AutoConfig list, and figure out which components
3040 belong to what sets. */
3041 ac = ac_list;
3042 while(ac!=NULL) {
3043 /* we're going to putz with ac->next, so save it here
3044 for use at the end of the loop */
3045 ac_next = ac->next;
3046
3047 if (config_sets == NULL) {
3048 /* will need at least this one... */
3049 config_sets = (RF_ConfigSet_t *)
3050 malloc(sizeof(RF_ConfigSet_t),
3051 M_RAIDFRAME, M_NOWAIT);
3052 if (config_sets == NULL) {
3053 panic("rf_create_auto_sets: No memory!");
3054 }
3055 /* this one is easy :) */
3056 config_sets->ac = ac;
3057 config_sets->next = NULL;
3058 config_sets->rootable = 0;
3059 ac->next = NULL;
3060 } else {
3061 /* which set does this component fit into? */
3062 cset = config_sets;
3063 while(cset!=NULL) {
3064 if (rf_does_it_fit(cset, ac)) {
3065 /* looks like it matches... */
3066 ac->next = cset->ac;
3067 cset->ac = ac;
3068 break;
3069 }
3070 cset = cset->next;
3071 }
3072 if (cset==NULL) {
3073 /* didn't find a match above... new set..*/
3074 cset = (RF_ConfigSet_t *)
3075 malloc(sizeof(RF_ConfigSet_t),
3076 M_RAIDFRAME, M_NOWAIT);
3077 if (cset == NULL) {
3078 panic("rf_create_auto_sets: No memory!");
3079 }
3080 cset->ac = ac;
3081 ac->next = NULL;
3082 cset->next = config_sets;
3083 cset->rootable = 0;
3084 config_sets = cset;
3085 }
3086 }
3087 ac = ac_next;
3088 }
3089
3090
3091 return(config_sets);
3092 }
3093
3094 static int
3095 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3096 {
3097 RF_ComponentLabel_t *clabel1, *clabel2;
3098
3099 /* If this one matches the *first* one in the set, that's good
3100 enough, since the other members of the set would have been
3101 through here too... */
3102 /* note that we are not checking partitionSize here..
3103
3104 Note that we are also not checking the mod_counters here.
3105 If everything else matches execpt the mod_counter, that's
3106 good enough for this test. We will deal with the mod_counters
3107 a little later in the autoconfiguration process.
3108
3109 (clabel1->mod_counter == clabel2->mod_counter) &&
3110
3111 The reason we don't check for this is that failed disks
3112 will have lower modification counts. If those disks are
3113 not added to the set they used to belong to, then they will
3114 form their own set, which may result in 2 different sets,
3115 for example, competing to be configured at raid0, and
3116 perhaps competing to be the root filesystem set. If the
3117 wrong ones get configured, or both attempt to become /,
3118 weird behaviour and or serious lossage will occur. Thus we
3119 need to bring them into the fold here, and kick them out at
3120 a later point.
3121
3122 */
3123
3124 clabel1 = cset->ac->clabel;
3125 clabel2 = ac->clabel;
3126 if ((clabel1->version == clabel2->version) &&
3127 (clabel1->serial_number == clabel2->serial_number) &&
3128 (clabel1->num_rows == clabel2->num_rows) &&
3129 (clabel1->num_columns == clabel2->num_columns) &&
3130 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3131 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3132 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3133 (clabel1->parityConfig == clabel2->parityConfig) &&
3134 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3135 (clabel1->blockSize == clabel2->blockSize) &&
3136 (clabel1->numBlocks == clabel2->numBlocks) &&
3137 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3138 (clabel1->root_partition == clabel2->root_partition) &&
3139 (clabel1->last_unit == clabel2->last_unit) &&
3140 (clabel1->config_order == clabel2->config_order)) {
3141 /* if it get's here, it almost *has* to be a match */
3142 } else {
3143 /* it's not consistent with somebody in the set..
3144 punt */
3145 return(0);
3146 }
3147 /* all was fine.. it must fit... */
3148 return(1);
3149 }
3150
/*
 * Decide whether a config set has enough live components (at the set's
 * newest mod_counter) to be configured.  Returns 1 if so, 0 if not.
 * RAID 1 is special-cased: only losing both members of an even/odd
 * component pair is fatal there.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* the newest (largest) mod_counter wins; components with an
	   older counter are treated as missing below */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (mod_counter matches) component
		   for this column */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just did an even component, and we didn't
			   bail.. reset the even_pair_failed flag,
			   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* tolerated failures: none for RAID 0, one for RAID 4/5 */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3253
/*
 * Build an RF_Config_t from the component labels of an auto-config
 * set: layout parameters come from the first component's label, and
 * each component's device name is stored in its column slot.
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
			RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numRow = clabel->num_rows = 1;	/* rows are forced to 1 */
	config->numCol = clabel->num_columns;
	config->numSpare = 0;	/* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ?? */

	while(ac!=NULL) {
		/* row/col values will be in range due to the checks
		   in reasonable_label() */
		strcpy(config->devnames[0][ac->clabel->column],
		       ac->devname);
		ac = ac->next;
	}

	/* no debug variables are carried over */
	for(i=0;i<RF_MAXDBGV;i++) {
		config->debugVars[i][0] = 0;
	}
}
3288
3289 int
3290 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3291 {
3292 RF_ComponentLabel_t clabel;
3293 struct vnode *vp;
3294 dev_t dev;
3295 int column;
3296 int sparecol;
3297
3298 raidPtr->autoconfigure = new_value;
3299
3300 for(column=0; column<raidPtr->numCol; column++) {
3301 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3302 dev = raidPtr->Disks[column].dev;
3303 vp = raidPtr->raid_cinfo[column].ci_vp;
3304 raidread_component_label(dev, vp, &clabel);
3305 clabel.autoconfigure = new_value;
3306 raidwrite_component_label(dev, vp, &clabel);
3307 }
3308 }
3309 for(column = 0; column < raidPtr->numSpare ; column++) {
3310 sparecol = raidPtr->numCol + column;
3311 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3312 dev = raidPtr->Disks[sparecol].dev;
3313 vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3314 raidread_component_label(dev, vp, &clabel);
3315 clabel.autoconfigure = new_value;
3316 raidwrite_component_label(dev, vp, &clabel);
3317 }
3318 }
3319 return(new_value);
3320 }
3321
3322 int
3323 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3324 {
3325 RF_ComponentLabel_t clabel;
3326 struct vnode *vp;
3327 dev_t dev;
3328 int column;
3329 int sparecol;
3330
3331 raidPtr->root_partition = new_value;
3332 for(column=0; column<raidPtr->numCol; column++) {
3333 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3334 dev = raidPtr->Disks[column].dev;
3335 vp = raidPtr->raid_cinfo[column].ci_vp;
3336 raidread_component_label(dev, vp, &clabel);
3337 clabel.root_partition = new_value;
3338 raidwrite_component_label(dev, vp, &clabel);
3339 }
3340 }
3341 for(column = 0; column < raidPtr->numSpare ; column++) {
3342 sparecol = raidPtr->numCol + column;
3343 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3344 dev = raidPtr->Disks[sparecol].dev;
3345 vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3346 raidread_component_label(dev, vp, &clabel);
3347 clabel.root_partition = new_value;
3348 raidwrite_component_label(dev, vp, &clabel);
3349 }
3350 }
3351 return(new_value);
3352 }
3353
3354 void
3355 rf_release_all_vps(RF_ConfigSet_t *cset)
3356 {
3357 RF_AutoConfig_t *ac;
3358
3359 ac = cset->ac;
3360 while(ac!=NULL) {
3361 /* Close the vp, and give it back */
3362 if (ac->vp) {
3363 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3364 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3365 vput(ac->vp);
3366 ac->vp = NULL;
3367 }
3368 ac = ac->next;
3369 }
3370 }
3371
3372
3373 void
3374 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3375 {
3376 RF_AutoConfig_t *ac;
3377 RF_AutoConfig_t *next_ac;
3378
3379 ac = cset->ac;
3380 while(ac!=NULL) {
3381 next_ac = ac->next;
3382 /* nuke the label */
3383 free(ac->clabel, M_RAIDFRAME);
3384 /* cleanup the config structure */
3385 free(ac, M_RAIDFRAME);
3386 /* "next.." */
3387 ac = next_ac;
3388 }
3389 /* and, finally, nuke the config set */
3390 free(cset, M_RAIDFRAME);
3391 }
3392
3393
/*
 * raid_init_component_label
 *
 * Initialize *clabel from the current state of *raidPtr.  Only the
 * set-wide fields are filled in here; per-component fields (column,
 * partition size, ...) are presumably set by the caller -- confirm.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;
	clabel->num_rows = 1;	/* rows are always 1 now */
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* stripe layout parameters */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	clabel->numBlocks = raidPtr->sectorsPerDisk;

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	/* remember the unit this set was configured as, so that
	   autoconfiguration can try to reuse it (see rf_auto_config_set) */
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;
}
3421
3422 int
3423 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
3424 {
3425 RF_Raid_t *raidPtr;
3426 RF_Config_t *config;
3427 int raidID;
3428 int retcode;
3429
3430 #ifdef DEBUG
3431 printf("RAID autoconfigure\n");
3432 #endif
3433
3434 retcode = 0;
3435 *unit = -1;
3436
3437 /* 1. Create a config structure */
3438
3439 config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
3440 M_RAIDFRAME,
3441 M_NOWAIT);
3442 if (config==NULL) {
3443 printf("Out of mem!?!?\n");
3444 /* XXX do something more intelligent here. */
3445 return(1);
3446 }
3447
3448 memset(config, 0, sizeof(RF_Config_t));
3449
3450 /*
3451 2. Figure out what RAID ID this one is supposed to live at
3452 See if we can get the same RAID dev that it was configured
3453 on last time..
3454 */
3455
3456 raidID = cset->ac->clabel->last_unit;
3457 if ((raidID < 0) || (raidID >= numraid)) {
3458 /* let's not wander off into lala land. */
3459 raidID = numraid - 1;
3460 }
3461 if (raidPtrs[raidID]->valid != 0) {
3462
3463 /*
3464 Nope... Go looking for an alternative...
3465 Start high so we don't immediately use raid0 if that's
3466 not taken.
3467 */
3468
3469 for(raidID = numraid - 1; raidID >= 0; raidID--) {
3470 if (raidPtrs[raidID]->valid == 0) {
3471 /* can use this one! */
3472 break;
3473 }
3474 }
3475 }
3476
3477 if (raidID < 0) {
3478 /* punt... */
3479 printf("Unable to auto configure this set!\n");
3480 printf("(Out of RAID devs!)\n");
3481 free(config, M_RAIDFRAME);
3482 return(1);
3483 }
3484
3485 #ifdef DEBUG
3486 printf("Configuring raid%d:\n",raidID);
3487 #endif
3488
3489 raidPtr = raidPtrs[raidID];
3490
3491 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3492 raidPtr->raidid = raidID;
3493 raidPtr->openings = RAIDOUTSTANDING;
3494
3495 /* 3. Build the configuration structure */
3496 rf_create_configuration(cset->ac, config, raidPtr);
3497
3498 /* 4. Do the configuration */
3499 retcode = rf_Configure(raidPtr, config, cset->ac);
3500
3501 if (retcode == 0) {
3502
3503 raidinit(raidPtrs[raidID]);
3504
3505 rf_markalldirty(raidPtrs[raidID]);
3506 raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
3507 if (cset->ac->clabel->root_partition==1) {
3508 /* everything configured just fine. Make a note
3509 that this set is eligible to be root. */
3510 cset->rootable = 1;
3511 /* XXX do this here? */
3512 raidPtrs[raidID]->root_partition = 1;
3513 }
3514 }
3515
3516 /* 5. Cleanup */
3517 free(config, M_RAIDFRAME);
3518
3519 *unit = raidID;
3520 return(retcode);
3521 }
3522
3523 void
3524 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3525 {
3526 struct buf *bp;
3527
3528 bp = (struct buf *)desc->bp;
3529 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3530 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3531 }
3532
/*
 * rf_pool_init
 *
 * Initialize a pool with the parameters RAIDframe uses everywhere:
 * no special alignment or flags, protected at IPL_BIO, primed with
 * xmin preallocated items, and with low/high water marks of
 * xmin/xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);		/* high-water mark */
	pool_prime(p, xmin);		/* preallocate xmin items */
	pool_setlowat(p, xmin);		/* low-water mark */
}
3542
3543 /*
3544 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3545 * if there is IO pending and if that IO could possibly be done for a
3546 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3547 * otherwise.
3548 *
3549 */
3550
3551 int
3552 rf_buf_queue_check(int raidid)
3553 {
3554 if ((BUFQ_PEEK(raid_softc[raidid].buf_queue) != NULL) &&
3555 raidPtrs[raidid]->openings > 0) {
3556 /* there is work to do */
3557 return 0;
3558 }
3559 /* default is nothing to do */
3560 return 1;
3561 }
3562
/*
 * rf_getdisksize
 *
 * Fill in the size-related fields of diskPtr (blockSize, numBlocks,
 * partitionSize) for the component open on vp.  Tries the classic
 * disklabel partition ioctl first and falls back to wedge info.
 * Returns 0 on success, or the error from the last failing ioctl.
 */
int
rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
{
	struct partinfo dpart;
	struct dkwedge_info dkw;
	int error;

	/* disklabel partition? */
	error = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, l->l_cred);
	if (error == 0) {
		diskPtr->blockSize = dpart.disklab->d_secsize;
		/* the first rf_protectedSectors are reserved
		   (presumably for the component label -- confirm) */
		diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
		diskPtr->partitionSize = dpart.part->p_size;
		return 0;
	}
	/* a wedge, then? */
	error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, l->l_cred);
	if (error == 0) {
		diskPtr->blockSize = 512;	/* XXX wedge info carries no sector size */
		diskPtr->numBlocks = dkw.dkw_size - rf_protectedSectors;
		diskPtr->partitionSize = dkw.dkw_size;
		return 0;
	}
	return error;
}
3587
/*
 * raid_match
 *
 * Autoconfiguration match hook: raid pseudo-devices always match.
 */
static int
raid_match(struct device *self, struct cfdata *cfdata,
    void *aux)
{
	return 1;
}
3594
/*
 * raid_attach
 *
 * Autoconfiguration attach hook: nothing to do here; the real
 * setup happens when the set is configured (see raidinit et al.).
 */
static void
raid_attach(struct device *parent, struct device *self,
    void *aux)
{

}
3601
3602
3603 static int
3604 raid_detach(struct device *self, int flags)
3605 {
3606 struct raid_softc *rs = (struct raid_softc *)self;
3607
3608 if (rs->sc_flags & RAIDF_INITED)
3609 return EBUSY;
3610
3611 return 0;
3612 }
3613
/*
 * rf_set_properties
 *
 * Publish a "disk-info" property dictionary for the raid device.
 * Sector size and total sectors are real; the rest of the geometry
 * is fabricated from the stripe layout (one "track" per data stripe,
 * 4 * numCol "tracks" per cylinder).
 */
static void
rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	prop_dictionary_t disk_info, odisk_info, geom;
	disk_info = prop_dictionary_create();
	geom = prop_dictionary_create();
	prop_dictionary_set_uint64(geom, "sectors-per-unit",
				   raidPtr->totalSectors);
	prop_dictionary_set_uint32(geom, "sector-size",
				   raidPtr->bytesPerSector);

	/* fabricated geometry */
	prop_dictionary_set_uint16(geom, "sectors-per-track",
				   raidPtr->Layout.dataSectorsPerStripe);
	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
				   4 * raidPtr->numCol);

	/* cylinders = total sectors / (sectors per cylinder) */
	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
	  raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
				   (4 * raidPtr->numCol)));

	/* disk_info holds a reference to geom now; drop ours */
	prop_dictionary_set(disk_info, "geometry", geom);
	prop_object_release(geom);
	prop_dictionary_set(device_properties(rs->sc_dev),
			    "disk-info", disk_info);
	/* swap in the new dictionary and release the old one, if any */
	odisk_info = rs->sc_dkdev.dk_info;
	rs->sc_dkdev.dk_info = disk_info;
	if (odisk_info)
		prop_object_release(odisk_info);
}
3643