rf_netbsdkintf.c revision 1.243.6.6 1 /* $NetBSD: rf_netbsdkintf.c,v 1.243.6.6 2008/10/05 20:11:31 mjf Exp $ */
2 /*-
3 * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Greg Oster; Jason R. Thorpe.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * Copyright (c) 1990, 1993
33 * The Regents of the University of California. All rights reserved.
34 *
35 * This code is derived from software contributed to Berkeley by
36 * the Systems Programming Group of the University of Utah Computer
37 * Science Department.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 * may be used to endorse or promote products derived from this software
49 * without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 *
63 * from: Utah $Hdr: cd.c 1.6 90/11/28$
64 *
65 * @(#)cd.c 8.2 (Berkeley) 11/16/93
66 */
67
68 /*
69 * Copyright (c) 1988 University of Utah.
70 *
71 * This code is derived from software contributed to Berkeley by
72 * the Systems Programming Group of the University of Utah Computer
73 * Science Department.
74 *
75 * Redistribution and use in source and binary forms, with or without
76 * modification, are permitted provided that the following conditions
77 * are met:
78 * 1. Redistributions of source code must retain the above copyright
79 * notice, this list of conditions and the following disclaimer.
80 * 2. Redistributions in binary form must reproduce the above copyright
81 * notice, this list of conditions and the following disclaimer in the
82 * documentation and/or other materials provided with the distribution.
83 * 3. All advertising materials mentioning features or use of this software
84 * must display the following acknowledgement:
85 * This product includes software developed by the University of
86 * California, Berkeley and its contributors.
87 * 4. Neither the name of the University nor the names of its contributors
88 * may be used to endorse or promote products derived from this software
89 * without specific prior written permission.
90 *
91 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
92 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
93 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
94 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
95 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
96 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
97 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
98 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
99 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
100 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
101 * SUCH DAMAGE.
102 *
103 * from: Utah $Hdr: cd.c 1.6 90/11/28$
104 *
105 * @(#)cd.c 8.2 (Berkeley) 11/16/93
106 */
107
108 /*
109 * Copyright (c) 1995 Carnegie-Mellon University.
110 * All rights reserved.
111 *
112 * Authors: Mark Holland, Jim Zelenka
113 *
114 * Permission to use, copy, modify and distribute this software and
115 * its documentation is hereby granted, provided that both the copyright
116 * notice and this permission notice appear in all copies of the
117 * software, derivative works or modified versions, and any portions
118 * thereof, and that both notices appear in supporting documentation.
119 *
120 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
121 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
122 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
123 *
124 * Carnegie Mellon requests users of this software to return to
125 *
126 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
127 * School of Computer Science
128 * Carnegie Mellon University
129 * Pittsburgh PA 15213-3890
130 *
131 * any improvements or extensions that they make and grant Carnegie the
132 * rights to redistribute these changes.
133 */
134
135 /***********************************************************
136 *
137 * rf_kintf.c -- the kernel interface routines for RAIDframe
138 *
139 ***********************************************************/
140
141 #include <sys/cdefs.h>
142 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.243.6.6 2008/10/05 20:11:31 mjf Exp $");
143
144 #include <sys/param.h>
145 #include <sys/errno.h>
146 #include <sys/pool.h>
147 #include <sys/proc.h>
148 #include <sys/queue.h>
149 #include <sys/disk.h>
150 #include <sys/device.h>
151 #include <sys/stat.h>
152 #include <sys/ioctl.h>
153 #include <sys/fcntl.h>
154 #include <sys/systm.h>
155 #include <sys/vnode.h>
156 #include <sys/disklabel.h>
157 #include <sys/conf.h>
158 #include <sys/buf.h>
159 #include <sys/bufq.h>
160 #include <sys/user.h>
161 #include <sys/reboot.h>
162 #include <sys/kauth.h>
163
164 #include <prop/proplib.h>
165
166 #include <dev/raidframe/raidframevar.h>
167 #include <dev/raidframe/raidframeio.h>
168 #include "raid.h"
169 #include "opt_raid_autoconfig.h"
170 #include "rf_raid.h"
171 #include "rf_copyback.h"
172 #include "rf_dag.h"
173 #include "rf_dagflags.h"
174 #include "rf_desc.h"
175 #include "rf_diskqueue.h"
176 #include "rf_etimer.h"
177 #include "rf_general.h"
178 #include "rf_kintf.h"
179 #include "rf_options.h"
180 #include "rf_driver.h"
181 #include "rf_parityscan.h"
182 #include "rf_threadstuff.h"
183
184 #ifdef DEBUG
185 int rf_kdebug_level = 0;
186 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
187 #else /* DEBUG */
188 #define db1_printf(a) { }
189 #endif /* DEBUG */
190
191 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
192
193 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
194 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
195
196 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
197 * spare table */
198 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
199 * installation process */
200 #endif
201
202 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
203
204 /* prototypes */
205 static void KernelWakeupFunc(struct buf *);
206 static void InitBP(struct buf *, struct vnode *, unsigned,
207 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
208 void *, int, struct proc *);
209 static void raidinit(RF_Raid_t *);
210
211 void raidattach(int);
212 static int raid_match(struct device *, struct cfdata *, void *);
213 static void raid_attach(struct device *, struct device *, void *);
214 static int raid_detach(struct device *, int);
215
216 dev_type_open(raidopen);
217 dev_type_close(raidclose);
218 dev_type_read(raidread);
219 dev_type_write(raidwrite);
220 dev_type_ioctl(raidioctl);
221 dev_type_strategy(raidstrategy);
222 dev_type_dump(raiddump);
223 dev_type_size(raidsize);
224
225 const struct bdevsw raid_bdevsw = {
226 raidopen, raidclose, raidstrategy, raidioctl,
227 raiddump, raidsize, D_DISK
228 };
229
230 const struct cdevsw raid_cdevsw = {
231 raidopen, raidclose, raidread, raidwrite, raidioctl,
232 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
233 };
234
235 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
236
237 /* XXX Not sure if the following should be replacing the raidPtrs above,
238 or if it should be used in conjunction with that...
239 */
240
/*
 * Per-unit software state for a RAID device.  One of these exists for
 * each configured raid unit; the array is allocated in raidattach().
 */
struct raid_softc {
	struct device *sc_dev;		/* autoconf device handle */
	int     sc_flags;	/* flags (RAIDF_* below) */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device, in DEV_BSIZE blocks */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
};
250 /* sc_flags */
251 #define RAIDF_INITED 0x01 /* unit has been initialized */
252 #define RAIDF_WLABEL 0x02 /* label area is writable */
253 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
254 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
255 #define RAIDF_LOCKED 0x80 /* unit is locked */
256
257 #define raidunit(x) DISKUNIT(x)
258 int numraid = 0;
259
260 extern struct cfdriver raid_cd;
261 CFATTACH_DECL_NEW(raid, sizeof(struct raid_softc),
262 raid_match, raid_attach, raid_detach, NULL);
263
264 /*
265 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
266 * Be aware that large numbers can allow the driver to consume a lot of
267 * kernel memory, especially on writes, and in degraded mode reads.
268 *
269 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
270 * a single 64K write will typically require 64K for the old data,
271 * 64K for the old parity, and 64K for the new parity, for a total
272 * of 192K (if the parity buffer is not re-used immediately).
273 * Even it if is used immediately, that's still 128K, which when multiplied
274 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
275 *
276 * Now in degraded mode, for example, a 64K read on the above setup may
277 * require data reconstruction, which will require *all* of the 4 remaining
278 * disks to participate -- 4 * 32K/disk == 128K again.
279 */
280
281 #ifndef RAIDOUTSTANDING
282 #define RAIDOUTSTANDING 6
283 #endif
284
285 #define RAIDLABELDEV(dev) \
286 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
287
288 /* declared here, and made public, for the benefit of KVM stuff.. */
289 struct raid_softc *raid_softc;
290
291 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
292 struct disklabel *);
293 static void raidgetdisklabel(dev_t);
294 static void raidmakedisklabel(struct raid_softc *);
295
296 static int raidlock(struct raid_softc *);
297 static void raidunlock(struct raid_softc *);
298
299 static void rf_markalldirty(RF_Raid_t *);
300 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
301
302 void rf_ReconThread(struct rf_recon_req *);
303 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
304 void rf_CopybackThread(RF_Raid_t *raidPtr);
305 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
306 int rf_autoconfig(struct device *self);
307 void rf_buildroothack(RF_ConfigSet_t *);
308
309 RF_AutoConfig_t *rf_find_raid_components(void);
310 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
311 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
312 static int rf_reasonable_label(RF_ComponentLabel_t *);
313 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
314 int rf_set_autoconfig(RF_Raid_t *, int);
315 int rf_set_rootpartition(RF_Raid_t *, int);
316 void rf_release_all_vps(RF_ConfigSet_t *);
317 void rf_cleanup_config_set(RF_ConfigSet_t *);
318 int rf_have_enough_components(RF_ConfigSet_t *);
319 int rf_auto_config_set(RF_ConfigSet_t *, int *);
320
321 static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not
322 allow autoconfig to take place.
323 Note that this is overridden by having
324 RAID_AUTOCONFIG as an option in the
325 kernel config file. */
326
327 struct RF_Pools_s rf_pools;
328
/*
 * raidattach: pseudo-device attach routine.  Called once at boot with
 * the number of raid units to support.  Allocates the global raidPtrs
 * and raid_softc arrays, boots the RAIDframe core, attaches the
 * cfattach, and registers a config finalizer that will perform
 * component auto-configuration after all real hardware has attached.
 *
 * On partial allocation failure, numraid is trimmed so the rest of the
 * driver only sees the units that were fully set up.
 */
void
raidattach(int num)
{
	int raidID;
	int i, rc;

#ifdef DEBUG
	printf("raidattach: Asked for %d units\n", num);
#endif

	if (num <= 0) {
#ifdef DIAGNOSTIC
		panic("raidattach: count <= 0");
#endif
		return;
	}
	/* This is where all the initialization stuff gets done. */

	numraid = num;

	/* Make some space for requested number of units... */

	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
	if (raidPtrs == NULL) {
		panic("raidPtrs is NULL!!");
	}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_mutex_init(&rf_sparet_wait_mutex);

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	for (i = 0; i < num; i++)
		raidPtrs[i] = NULL;
	rc = rf_BootRaidframe();
	if (rc == 0)
		aprint_normal("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	/* put together some datastructures like the CCD device does.. This
	 * lets us lock the device and what-not when it gets opened. */

	raid_softc = (struct raid_softc *)
	    malloc(num * sizeof(struct raid_softc),
		   M_RAIDFRAME, M_NOWAIT);
	if (raid_softc == NULL) {
		aprint_error("WARNING: no memory for RAIDframe driver\n");
		return;
	}

	memset(raid_softc, 0, num * sizeof(struct raid_softc));

	for (raidID = 0; raidID < num; raidID++) {
		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);

		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
			  (RF_Raid_t *));
		if (raidPtrs[raidID] == NULL) {
			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
			/* Shrink the visible unit count to what succeeded. */
			numraid = raidID;
			return;
		}
	}

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

#ifdef RAID_AUTOCONFIG
	raidautoconfig = 1;
#endif

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
410
411 int
412 rf_autoconfig(struct device *self)
413 {
414 RF_AutoConfig_t *ac_list;
415 RF_ConfigSet_t *config_sets;
416
417 if (raidautoconfig == 0)
418 return (0);
419
420 /* XXX This code can only be run once. */
421 raidautoconfig = 0;
422
423 /* 1. locate all RAID components on the system */
424 #ifdef DEBUG
425 printf("Searching for RAID components...\n");
426 #endif
427 ac_list = rf_find_raid_components();
428
429 /* 2. Sort them into their respective sets. */
430 config_sets = rf_create_auto_sets(ac_list);
431
432 /*
433 * 3. Evaluate each set andconfigure the valid ones.
434 * This gets done in rf_buildroothack().
435 */
436 rf_buildroothack(config_sets);
437
438 return 1;
439 }
440
/*
 * rf_buildroothack: configure every auto-configurable RAID set and, if
 * exactly one configured set is marked rootable, point booted_device at
 * it so setroot() will use the raid as the root device.  When several
 * sets claim rootability, try to disambiguate by matching each set's
 * component device names against the MD-determined boot device; if
 * that still leaves more than one candidate, fall back to asking the
 * user (RB_ASKNAME).
 *
 * Consumes (configures or releases, then frees) the whole config_sets
 * list.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int retcode;
	int raidID;
	int rootID;
	int col;
	int num_root;
	char *devname;

	rootID = 0;
	num_root = 0;
	cset = config_sets;
	while(cset != NULL ) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure==1) {
			retcode = rf_auto_config_set(cset,&raidID);
			if (!retcode) {
#ifdef DEBUG
				printf("raid%d: configured ok\n", raidID);
#endif
				if (cset->rootable) {
					rootID = raidID;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
#ifdef DEBUG
				printf("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
#endif
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		booted_device = raid_softc[rootID].sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/* Re-count, keeping only the sets that contain the
		   device we actually booted from. */
		num_root = 0;
		for (raidID = 0; raidID < numraid; raidID++) {
			if (raidPtrs[raidID]->valid == 0)
				continue;

			if (raidPtrs[raidID]->root_partition == 0)
				continue;

			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
				devname = raidPtrs[raidID]->Disks[col].devname;
				/* skip the "/dev/" prefix before comparing */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
					    strlen(device_xname(booted_device))) != 0)
					continue;
#ifdef DEBUG
				printf("raid%d includes boot device %s\n",
				       raidID, devname);
#endif
				num_root++;
				rootID = raidID;
			}
		}

		if (num_root == 1) {
			booted_device = raid_softc[rootID].sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
541
542
543 int
544 raidsize(dev_t dev)
545 {
546 struct raid_softc *rs;
547 struct disklabel *lp;
548 int part, unit, omask, size;
549
550 unit = raidunit(dev);
551 if (unit >= numraid)
552 return (-1);
553 rs = &raid_softc[unit];
554
555 if ((rs->sc_flags & RAIDF_INITED) == 0)
556 return (-1);
557
558 part = DISKPART(dev);
559 omask = rs->sc_dkdev.dk_openmask & (1 << part);
560 lp = rs->sc_dkdev.dk_label;
561
562 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
563 return (-1);
564
565 if (lp->d_partitions[part].p_fstype != FS_SWAP)
566 size = -1;
567 else
568 size = lp->d_partitions[part].p_size *
569 (lp->d_secsize / DEV_BSIZE);
570
571 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
572 return (-1);
573
574 return (size);
575
576 }
577
/*
 * raiddump: crash-dump entry point.  Only RAID 1 sets (one data
 * column, one parity column) are supported.  Picks a single live
 * component (or used spare) and forwards the dump to that component's
 * underlying block device, translating blkno into the component's
 * address space.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if (unit >= numraid)
		return (ENXIO);

	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* Dumps must be in whole DEV_BSIZE blocks. */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* Refuse dumps that would run past the end of the device. */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
715 /* ARGSUSED */
/*
 * raidopen: open a partition of a raid unit.  Takes the unit lock for
 * the duration, validates the partition, records the open in the
 * char/block open masks (which prevents unconfiguration while open),
 * and on the first open of a configured set marks all components
 * dirty so an unclean shutdown is detectable.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);
	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of a configured unit: (re)read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty( raidPtrs[unit] );
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
795 /* ARGSUSED */
/*
 * raidclose: close a partition of a raid unit.  Clears the partition
 * bit in the appropriate open mask; on the last close of a configured
 * set, writes final (clean) component labels, and if the system is
 * shutting down, also shuts the RAID set down and detaches the device.
 */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct cfdata *cf;
	struct raid_softc *rs;
	int     error = 0;
	int     part;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(raidPtrs[unit],
						 RF_FINAL_COMPONENT_UPDATE);
		if (doing_shutdown) {
			/* last one, and we're going down, so
			   lights out for this RAID set too. */
			error = rf_Shutdown(raidPtrs[unit]);

			/* It's no longer initialized... */
			rs->sc_flags &= ~RAIDF_INITED;

			/* detach the device */

			cf = device_cfdata(rs->sc_dev);
			error = config_detach(rs->sc_dev, DETACH_QUIET);
			free(cf, M_RAIDFRAME);

			/* Detach the disk. */
			disk_detach(&rs->sc_dkdev);
			disk_destroy(&rs->sc_dkdev);
		}
	}

	raidunlock(rs);
	/* NOTE(review): any error from rf_Shutdown()/config_detach()
	   above is discarded here -- callers always see success.
	   Confirm this is intentional (close during shutdown cannot
	   usefully be retried). */
	return (0);

}
860
861 void
862 raidstrategy(struct buf *bp)
863 {
864 int s;
865
866 unsigned int raidID = raidunit(bp->b_dev);
867 RF_Raid_t *raidPtr;
868 struct raid_softc *rs = &raid_softc[raidID];
869 int wlabel;
870
871 if ((rs->sc_flags & RAIDF_INITED) ==0) {
872 bp->b_error = ENXIO;
873 goto done;
874 }
875 if (raidID >= numraid || !raidPtrs[raidID]) {
876 bp->b_error = ENODEV;
877 goto done;
878 }
879 raidPtr = raidPtrs[raidID];
880 if (!raidPtr->valid) {
881 bp->b_error = ENODEV;
882 goto done;
883 }
884 if (bp->b_bcount == 0) {
885 db1_printf(("b_bcount is zero..\n"));
886 goto done;
887 }
888
889 /*
890 * Do bounds checking and adjust transfer. If there's an
891 * error, the bounds check will flag that for us.
892 */
893
894 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
895 if (DISKPART(bp->b_dev) == RAW_PART) {
896 uint64_t size; /* device size in DEV_BSIZE unit */
897
898 if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
899 size = raidPtr->totalSectors <<
900 (raidPtr->logBytesPerSector - DEV_BSHIFT);
901 } else {
902 size = raidPtr->totalSectors >>
903 (DEV_BSHIFT - raidPtr->logBytesPerSector);
904 }
905 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
906 goto done;
907 }
908 } else {
909 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
910 db1_printf(("Bounds check failed!!:%d %d\n",
911 (int) bp->b_blkno, (int) wlabel));
912 goto done;
913 }
914 }
915 s = splbio();
916
917 bp->b_resid = 0;
918
919 /* stuff it onto our queue */
920 BUFQ_PUT(rs->buf_queue, bp);
921
922 /* scheduled the IO to happen at the next convenient time */
923 wakeup(&(raidPtrs[raidID]->iodone));
924
925 splx(s);
926 return;
927
928 done:
929 bp->b_resid = bp->b_bcount;
930 biodone(bp);
931 }
932 /* ARGSUSED */
933 int
934 raidread(dev_t dev, struct uio *uio, int flags)
935 {
936 int unit = raidunit(dev);
937 struct raid_softc *rs;
938
939 if (unit >= numraid)
940 return (ENXIO);
941 rs = &raid_softc[unit];
942
943 if ((rs->sc_flags & RAIDF_INITED) == 0)
944 return (ENXIO);
945
946 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
947
948 }
949 /* ARGSUSED */
950 int
951 raidwrite(dev_t dev, struct uio *uio, int flags)
952 {
953 int unit = raidunit(dev);
954 struct raid_softc *rs;
955
956 if (unit >= numraid)
957 return (ENXIO);
958 rs = &raid_softc[unit];
959
960 if ((rs->sc_flags & RAIDF_INITED) == 0)
961 return (ENXIO);
962
963 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
964
965 }
966
967 int
968 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
969 {
970 int unit = raidunit(dev);
971 int error = 0;
972 int part, pmask;
973 struct cfdata *cf;
974 struct raid_softc *rs;
975 RF_Config_t *k_cfg, *u_cfg;
976 RF_Raid_t *raidPtr;
977 RF_RaidDisk_t *diskPtr;
978 RF_AccTotals_t *totals;
979 RF_DeviceConfig_t *d_cfg, **ucfgp;
980 u_char *specific_buf;
981 int retcode = 0;
982 int column;
983 int raidid;
984 struct rf_recon_req *rrcopy, *rr;
985 RF_ComponentLabel_t *clabel;
986 RF_ComponentLabel_t *ci_label;
987 RF_ComponentLabel_t **clabel_ptr;
988 RF_SingleComponent_t *sparePtr,*componentPtr;
989 RF_SingleComponent_t component;
990 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
991 int i, j, d;
992 #ifdef __HAVE_OLD_DISKLABEL
993 struct disklabel newlabel;
994 #endif
995 struct dkwedge_info *dkw;
996
997 if (unit >= numraid)
998 return (ENXIO);
999 rs = &raid_softc[unit];
1000 raidPtr = raidPtrs[unit];
1001
1002 db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
1003 (int) DISKPART(dev), (int) unit, (int) cmd));
1004
1005 /* Must be open for writes for these commands... */
1006 switch (cmd) {
1007 #ifdef DIOCGSECTORSIZE
1008 case DIOCGSECTORSIZE:
1009 *(u_int *)data = raidPtr->bytesPerSector;
1010 return 0;
1011 case DIOCGMEDIASIZE:
1012 *(off_t *)data =
1013 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1014 return 0;
1015 #endif
1016 case DIOCSDINFO:
1017 case DIOCWDINFO:
1018 #ifdef __HAVE_OLD_DISKLABEL
1019 case ODIOCWDINFO:
1020 case ODIOCSDINFO:
1021 #endif
1022 case DIOCWLABEL:
1023 case DIOCAWEDGE:
1024 case DIOCDWEDGE:
1025 if ((flag & FWRITE) == 0)
1026 return (EBADF);
1027 }
1028
1029 /* Must be initialized for these... */
1030 switch (cmd) {
1031 case DIOCGDINFO:
1032 case DIOCSDINFO:
1033 case DIOCWDINFO:
1034 #ifdef __HAVE_OLD_DISKLABEL
1035 case ODIOCGDINFO:
1036 case ODIOCWDINFO:
1037 case ODIOCSDINFO:
1038 case ODIOCGDEFLABEL:
1039 #endif
1040 case DIOCGPART:
1041 case DIOCWLABEL:
1042 case DIOCGDEFLABEL:
1043 case DIOCAWEDGE:
1044 case DIOCDWEDGE:
1045 case DIOCLWEDGES:
1046 case RAIDFRAME_SHUTDOWN:
1047 case RAIDFRAME_REWRITEPARITY:
1048 case RAIDFRAME_GET_INFO:
1049 case RAIDFRAME_RESET_ACCTOTALS:
1050 case RAIDFRAME_GET_ACCTOTALS:
1051 case RAIDFRAME_KEEP_ACCTOTALS:
1052 case RAIDFRAME_GET_SIZE:
1053 case RAIDFRAME_FAIL_DISK:
1054 case RAIDFRAME_COPYBACK:
1055 case RAIDFRAME_CHECK_RECON_STATUS:
1056 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1057 case RAIDFRAME_GET_COMPONENT_LABEL:
1058 case RAIDFRAME_SET_COMPONENT_LABEL:
1059 case RAIDFRAME_ADD_HOT_SPARE:
1060 case RAIDFRAME_REMOVE_HOT_SPARE:
1061 case RAIDFRAME_INIT_LABELS:
1062 case RAIDFRAME_REBUILD_IN_PLACE:
1063 case RAIDFRAME_CHECK_PARITY:
1064 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1065 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1066 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1067 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1068 case RAIDFRAME_SET_AUTOCONFIG:
1069 case RAIDFRAME_SET_ROOT:
1070 case RAIDFRAME_DELETE_COMPONENT:
1071 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1072 if ((rs->sc_flags & RAIDF_INITED) == 0)
1073 return (ENXIO);
1074 }
1075
1076 switch (cmd) {
1077
1078 /* configure the system */
1079 case RAIDFRAME_CONFIGURE:
1080
1081 if (raidPtr->valid) {
1082 /* There is a valid RAID set running on this unit! */
1083 printf("raid%d: Device already configured!\n",unit);
1084 return(EINVAL);
1085 }
1086
1087 /* copy-in the configuration information */
1088 /* data points to a pointer to the configuration structure */
1089
1090 u_cfg = *((RF_Config_t **) data);
1091 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1092 if (k_cfg == NULL) {
1093 return (ENOMEM);
1094 }
1095 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1096 if (retcode) {
1097 RF_Free(k_cfg, sizeof(RF_Config_t));
1098 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1099 retcode));
1100 return (retcode);
1101 }
1102 /* allocate a buffer for the layout-specific data, and copy it
1103 * in */
1104 if (k_cfg->layoutSpecificSize) {
1105 if (k_cfg->layoutSpecificSize > 10000) {
1106 /* sanity check */
1107 RF_Free(k_cfg, sizeof(RF_Config_t));
1108 return (EINVAL);
1109 }
1110 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1111 (u_char *));
1112 if (specific_buf == NULL) {
1113 RF_Free(k_cfg, sizeof(RF_Config_t));
1114 return (ENOMEM);
1115 }
1116 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1117 k_cfg->layoutSpecificSize);
1118 if (retcode) {
1119 RF_Free(k_cfg, sizeof(RF_Config_t));
1120 RF_Free(specific_buf,
1121 k_cfg->layoutSpecificSize);
1122 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1123 retcode));
1124 return (retcode);
1125 }
1126 } else
1127 specific_buf = NULL;
1128 k_cfg->layoutSpecific = specific_buf;
1129
1130 /* should do some kind of sanity check on the configuration.
1131 * Store the sum of all the bytes in the last byte? */
1132
1133 /* configure the system */
1134
1135 /*
1136 * Clear the entire RAID descriptor, just to make sure
1137 * there is no stale data left in the case of a
1138 * reconfiguration
1139 */
1140 memset((char *) raidPtr, 0, sizeof(RF_Raid_t));
1141 raidPtr->raidid = unit;
1142
1143 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1144
1145 if (retcode == 0) {
1146
1147 /* allow this many simultaneous IO's to
1148 this RAID device */
1149 raidPtr->openings = RAIDOUTSTANDING;
1150
1151 raidinit(raidPtr);
1152 rf_markalldirty(raidPtr);
1153 }
1154 /* free the buffers. No return code here. */
1155 if (k_cfg->layoutSpecificSize) {
1156 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1157 }
1158 RF_Free(k_cfg, sizeof(RF_Config_t));
1159
1160 return (retcode);
1161
1162 /* shutdown the system */
1163 case RAIDFRAME_SHUTDOWN:
1164
1165 if ((error = raidlock(rs)) != 0)
1166 return (error);
1167
1168 /*
1169 * If somebody has a partition mounted, we shouldn't
1170 * shutdown.
1171 */
1172
1173 part = DISKPART(dev);
1174 pmask = (1 << part);
1175 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1176 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1177 (rs->sc_dkdev.dk_copenmask & pmask))) {
1178 raidunlock(rs);
1179 return (EBUSY);
1180 }
1181
1182 retcode = rf_Shutdown(raidPtr);
1183
1184 /* It's no longer initialized... */
1185 rs->sc_flags &= ~RAIDF_INITED;
1186
1187 /* free the pseudo device attach bits */
1188
1189 cf = device_cfdata(rs->sc_dev);
1190 /* XXX this causes us to not return any errors
1191 from the above call to rf_Shutdown() */
1192 retcode = config_detach(rs->sc_dev, DETACH_QUIET);
1193 free(cf, M_RAIDFRAME);
1194
1195 /* Detach the disk. */
1196 disk_detach(&rs->sc_dkdev);
1197 disk_destroy(&rs->sc_dkdev);
1198
1199 raidunlock(rs);
1200
1201 return (retcode);
1202 case RAIDFRAME_GET_COMPONENT_LABEL:
1203 clabel_ptr = (RF_ComponentLabel_t **) data;
1204 /* need to read the component label for the disk indicated
1205 by row,column in clabel */
1206
		/* For practice, let's get it directly from disk, rather
1208 than from the in-core copy */
1209 RF_Malloc( clabel, sizeof( RF_ComponentLabel_t ),
1210 (RF_ComponentLabel_t *));
1211 if (clabel == NULL)
1212 return (ENOMEM);
1213
1214 retcode = copyin( *clabel_ptr, clabel,
1215 sizeof(RF_ComponentLabel_t));
1216
1217 if (retcode) {
1218 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
1219 return(retcode);
1220 }
1221
1222 clabel->row = 0; /* Don't allow looking at anything else.*/
1223
1224 column = clabel->column;
1225
1226 if ((column < 0) || (column >= raidPtr->numCol +
1227 raidPtr->numSpare)) {
1228 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
1229 return(EINVAL);
1230 }
1231
1232 retcode = raidread_component_label(raidPtr->Disks[column].dev,
1233 raidPtr->raid_cinfo[column].ci_vp,
1234 clabel );
1235
1236 if (retcode == 0) {
1237 retcode = copyout(clabel, *clabel_ptr,
1238 sizeof(RF_ComponentLabel_t));
1239 }
1240 RF_Free(clabel, sizeof(RF_ComponentLabel_t));
1241 return (retcode);
1242
1243 case RAIDFRAME_SET_COMPONENT_LABEL:
1244 clabel = (RF_ComponentLabel_t *) data;
1245
1246 /* XXX check the label for valid stuff... */
1247 /* Note that some things *should not* get modified --
1248 the user should be re-initing the labels instead of
1249 trying to patch things.
1250 */
1251
1252 raidid = raidPtr->raidid;
1253 #ifdef DEBUG
1254 printf("raid%d: Got component label:\n", raidid);
1255 printf("raid%d: Version: %d\n", raidid, clabel->version);
1256 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1257 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1258 printf("raid%d: Column: %d\n", raidid, clabel->column);
1259 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1260 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1261 printf("raid%d: Status: %d\n", raidid, clabel->status);
1262 #endif
1263 clabel->row = 0;
1264 column = clabel->column;
1265
1266 if ((column < 0) || (column >= raidPtr->numCol)) {
1267 return(EINVAL);
1268 }
1269
1270 /* XXX this isn't allowed to do anything for now :-) */
1271
1272 /* XXX and before it is, we need to fill in the rest
1273 of the fields!?!?!?! */
1274 #if 0
1275 raidwrite_component_label(
1276 raidPtr->Disks[column].dev,
1277 raidPtr->raid_cinfo[column].ci_vp,
1278 clabel );
1279 #endif
1280 return (0);
1281
1282 case RAIDFRAME_INIT_LABELS:
1283 clabel = (RF_ComponentLabel_t *) data;
1284 /*
1285 we only want the serial number from
1286 the above. We get all the rest of the information
1287 from the config that was used to create this RAID
1288 set.
1289 */
1290
1291 raidPtr->serial_number = clabel->serial_number;
1292
1293 RF_Malloc(ci_label, sizeof(RF_ComponentLabel_t),
1294 (RF_ComponentLabel_t *));
1295 if (ci_label == NULL)
1296 return (ENOMEM);
1297
1298 raid_init_component_label(raidPtr, ci_label);
1299 ci_label->serial_number = clabel->serial_number;
1300 ci_label->row = 0; /* we dont' pretend to support more */
1301
1302 for(column=0;column<raidPtr->numCol;column++) {
1303 diskPtr = &raidPtr->Disks[column];
1304 if (!RF_DEAD_DISK(diskPtr->status)) {
1305 ci_label->partitionSize = diskPtr->partitionSize;
1306 ci_label->column = column;
1307 raidwrite_component_label(
1308 raidPtr->Disks[column].dev,
1309 raidPtr->raid_cinfo[column].ci_vp,
1310 ci_label );
1311 }
1312 }
1313 RF_Free(ci_label, sizeof(RF_ComponentLabel_t));
1314
1315 return (retcode);
1316 case RAIDFRAME_SET_AUTOCONFIG:
1317 d = rf_set_autoconfig(raidPtr, *(int *) data);
1318 printf("raid%d: New autoconfig value is: %d\n",
1319 raidPtr->raidid, d);
1320 *(int *) data = d;
1321 return (retcode);
1322
1323 case RAIDFRAME_SET_ROOT:
1324 d = rf_set_rootpartition(raidPtr, *(int *) data);
1325 printf("raid%d: New rootpartition value is: %d\n",
1326 raidPtr->raidid, d);
1327 *(int *) data = d;
1328 return (retcode);
1329
1330 /* initialize all parity */
1331 case RAIDFRAME_REWRITEPARITY:
1332
1333 if (raidPtr->Layout.map->faultsTolerated == 0) {
1334 /* Parity for RAID 0 is trivially correct */
1335 raidPtr->parity_good = RF_RAID_CLEAN;
1336 return(0);
1337 }
1338
1339 if (raidPtr->parity_rewrite_in_progress == 1) {
1340 /* Re-write is already in progress! */
1341 return(EINVAL);
1342 }
1343
1344 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1345 rf_RewriteParityThread,
1346 raidPtr,"raid_parity");
1347 return (retcode);
1348
1349
1350 case RAIDFRAME_ADD_HOT_SPARE:
1351 sparePtr = (RF_SingleComponent_t *) data;
1352 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1353 retcode = rf_add_hot_spare(raidPtr, &component);
1354 return(retcode);
1355
1356 case RAIDFRAME_REMOVE_HOT_SPARE:
1357 return(retcode);
1358
1359 case RAIDFRAME_DELETE_COMPONENT:
1360 componentPtr = (RF_SingleComponent_t *)data;
1361 memcpy( &component, componentPtr,
1362 sizeof(RF_SingleComponent_t));
1363 retcode = rf_delete_component(raidPtr, &component);
1364 return(retcode);
1365
1366 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1367 componentPtr = (RF_SingleComponent_t *)data;
1368 memcpy( &component, componentPtr,
1369 sizeof(RF_SingleComponent_t));
1370 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1371 return(retcode);
1372
1373 case RAIDFRAME_REBUILD_IN_PLACE:
1374
1375 if (raidPtr->Layout.map->faultsTolerated == 0) {
1376 /* Can't do this on a RAID 0!! */
1377 return(EINVAL);
1378 }
1379
1380 if (raidPtr->recon_in_progress == 1) {
1381 /* a reconstruct is already in progress! */
1382 return(EINVAL);
1383 }
1384
1385 componentPtr = (RF_SingleComponent_t *) data;
1386 memcpy( &component, componentPtr,
1387 sizeof(RF_SingleComponent_t));
1388 component.row = 0; /* we don't support any more */
1389 column = component.column;
1390
1391 if ((column < 0) || (column >= raidPtr->numCol)) {
1392 return(EINVAL);
1393 }
1394
1395 RF_LOCK_MUTEX(raidPtr->mutex);
1396 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1397 (raidPtr->numFailures > 0)) {
1398 /* XXX 0 above shouldn't be constant!!! */
1399 /* some component other than this has failed.
1400 Let's not make things worse than they already
1401 are... */
1402 printf("raid%d: Unable to reconstruct to disk at:\n",
1403 raidPtr->raidid);
1404 printf("raid%d: Col: %d Too many failures.\n",
1405 raidPtr->raidid, column);
1406 RF_UNLOCK_MUTEX(raidPtr->mutex);
1407 return (EINVAL);
1408 }
1409 if (raidPtr->Disks[column].status ==
1410 rf_ds_reconstructing) {
1411 printf("raid%d: Unable to reconstruct to disk at:\n",
1412 raidPtr->raidid);
1413 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column);
1414
1415 RF_UNLOCK_MUTEX(raidPtr->mutex);
1416 return (EINVAL);
1417 }
1418 if (raidPtr->Disks[column].status == rf_ds_spared) {
1419 RF_UNLOCK_MUTEX(raidPtr->mutex);
1420 return (EINVAL);
1421 }
1422 RF_UNLOCK_MUTEX(raidPtr->mutex);
1423
1424 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1425 if (rrcopy == NULL)
1426 return(ENOMEM);
1427
1428 rrcopy->raidPtr = (void *) raidPtr;
1429 rrcopy->col = column;
1430
1431 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1432 rf_ReconstructInPlaceThread,
1433 rrcopy,"raid_reconip");
1434 return(retcode);
1435
1436 case RAIDFRAME_GET_INFO:
1437 if (!raidPtr->valid)
1438 return (ENODEV);
1439 ucfgp = (RF_DeviceConfig_t **) data;
1440 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1441 (RF_DeviceConfig_t *));
1442 if (d_cfg == NULL)
1443 return (ENOMEM);
1444 d_cfg->rows = 1; /* there is only 1 row now */
1445 d_cfg->cols = raidPtr->numCol;
1446 d_cfg->ndevs = raidPtr->numCol;
1447 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1448 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1449 return (ENOMEM);
1450 }
1451 d_cfg->nspares = raidPtr->numSpare;
1452 if (d_cfg->nspares >= RF_MAX_DISKS) {
1453 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1454 return (ENOMEM);
1455 }
1456 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1457 d = 0;
1458 for (j = 0; j < d_cfg->cols; j++) {
1459 d_cfg->devs[d] = raidPtr->Disks[j];
1460 d++;
1461 }
1462 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1463 d_cfg->spares[i] = raidPtr->Disks[j];
1464 }
1465 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1466 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1467
1468 return (retcode);
1469
1470 case RAIDFRAME_CHECK_PARITY:
1471 *(int *) data = raidPtr->parity_good;
1472 return (0);
1473
1474 case RAIDFRAME_RESET_ACCTOTALS:
1475 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1476 return (0);
1477
1478 case RAIDFRAME_GET_ACCTOTALS:
1479 totals = (RF_AccTotals_t *) data;
1480 *totals = raidPtr->acc_totals;
1481 return (0);
1482
1483 case RAIDFRAME_KEEP_ACCTOTALS:
1484 raidPtr->keep_acc_totals = *(int *)data;
1485 return (0);
1486
1487 case RAIDFRAME_GET_SIZE:
1488 *(int *) data = raidPtr->totalSectors;
1489 return (0);
1490
1491 /* fail a disk & optionally start reconstruction */
1492 case RAIDFRAME_FAIL_DISK:
1493
1494 if (raidPtr->Layout.map->faultsTolerated == 0) {
1495 /* Can't do this on a RAID 0!! */
1496 return(EINVAL);
1497 }
1498
1499 rr = (struct rf_recon_req *) data;
1500 rr->row = 0;
1501 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1502 return (EINVAL);
1503
1504
1505 RF_LOCK_MUTEX(raidPtr->mutex);
1506 if (raidPtr->status == rf_rs_reconstructing) {
1507 /* you can't fail a disk while we're reconstructing! */
1508 /* XXX wrong for RAID6 */
1509 RF_UNLOCK_MUTEX(raidPtr->mutex);
1510 return (EINVAL);
1511 }
1512 if ((raidPtr->Disks[rr->col].status ==
1513 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1514 /* some other component has failed. Let's not make
1515 things worse. XXX wrong for RAID6 */
1516 RF_UNLOCK_MUTEX(raidPtr->mutex);
1517 return (EINVAL);
1518 }
1519 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1520 /* Can't fail a spared disk! */
1521 RF_UNLOCK_MUTEX(raidPtr->mutex);
1522 return (EINVAL);
1523 }
1524 RF_UNLOCK_MUTEX(raidPtr->mutex);
1525
1526 /* make a copy of the recon request so that we don't rely on
1527 * the user's buffer */
1528 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1529 if (rrcopy == NULL)
1530 return(ENOMEM);
1531 memcpy(rrcopy, rr, sizeof(*rr));
1532 rrcopy->raidPtr = (void *) raidPtr;
1533
1534 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1535 rf_ReconThread,
1536 rrcopy,"raid_recon");
1537 return (0);
1538
1539 /* invoke a copyback operation after recon on whatever disk
1540 * needs it, if any */
1541 case RAIDFRAME_COPYBACK:
1542
1543 if (raidPtr->Layout.map->faultsTolerated == 0) {
1544 /* This makes no sense on a RAID 0!! */
1545 return(EINVAL);
1546 }
1547
1548 if (raidPtr->copyback_in_progress == 1) {
1549 /* Copyback is already in progress! */
1550 return(EINVAL);
1551 }
1552
1553 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1554 rf_CopybackThread,
1555 raidPtr,"raid_copyback");
1556 return (retcode);
1557
1558 /* return the percentage completion of reconstruction */
1559 case RAIDFRAME_CHECK_RECON_STATUS:
1560 if (raidPtr->Layout.map->faultsTolerated == 0) {
1561 /* This makes no sense on a RAID 0, so tell the
1562 user it's done. */
1563 *(int *) data = 100;
1564 return(0);
1565 }
1566 if (raidPtr->status != rf_rs_reconstructing)
1567 *(int *) data = 100;
1568 else {
1569 if (raidPtr->reconControl->numRUsTotal > 0) {
1570 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1571 } else {
1572 *(int *) data = 0;
1573 }
1574 }
1575 return (0);
1576 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1577 progressInfoPtr = (RF_ProgressInfo_t **) data;
1578 if (raidPtr->status != rf_rs_reconstructing) {
1579 progressInfo.remaining = 0;
1580 progressInfo.completed = 100;
1581 progressInfo.total = 100;
1582 } else {
1583 progressInfo.total =
1584 raidPtr->reconControl->numRUsTotal;
1585 progressInfo.completed =
1586 raidPtr->reconControl->numRUsComplete;
1587 progressInfo.remaining = progressInfo.total -
1588 progressInfo.completed;
1589 }
1590 retcode = copyout(&progressInfo, *progressInfoPtr,
1591 sizeof(RF_ProgressInfo_t));
1592 return (retcode);
1593
1594 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1595 if (raidPtr->Layout.map->faultsTolerated == 0) {
1596 /* This makes no sense on a RAID 0, so tell the
1597 user it's done. */
1598 *(int *) data = 100;
1599 return(0);
1600 }
1601 if (raidPtr->parity_rewrite_in_progress == 1) {
1602 *(int *) data = 100 *
1603 raidPtr->parity_rewrite_stripes_done /
1604 raidPtr->Layout.numStripe;
1605 } else {
1606 *(int *) data = 100;
1607 }
1608 return (0);
1609
1610 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1611 progressInfoPtr = (RF_ProgressInfo_t **) data;
1612 if (raidPtr->parity_rewrite_in_progress == 1) {
1613 progressInfo.total = raidPtr->Layout.numStripe;
1614 progressInfo.completed =
1615 raidPtr->parity_rewrite_stripes_done;
1616 progressInfo.remaining = progressInfo.total -
1617 progressInfo.completed;
1618 } else {
1619 progressInfo.remaining = 0;
1620 progressInfo.completed = 100;
1621 progressInfo.total = 100;
1622 }
1623 retcode = copyout(&progressInfo, *progressInfoPtr,
1624 sizeof(RF_ProgressInfo_t));
1625 return (retcode);
1626
1627 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1628 if (raidPtr->Layout.map->faultsTolerated == 0) {
1629 /* This makes no sense on a RAID 0 */
1630 *(int *) data = 100;
1631 return(0);
1632 }
1633 if (raidPtr->copyback_in_progress == 1) {
1634 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1635 raidPtr->Layout.numStripe;
1636 } else {
1637 *(int *) data = 100;
1638 }
1639 return (0);
1640
1641 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1642 progressInfoPtr = (RF_ProgressInfo_t **) data;
1643 if (raidPtr->copyback_in_progress == 1) {
1644 progressInfo.total = raidPtr->Layout.numStripe;
1645 progressInfo.completed =
1646 raidPtr->copyback_stripes_done;
1647 progressInfo.remaining = progressInfo.total -
1648 progressInfo.completed;
1649 } else {
1650 progressInfo.remaining = 0;
1651 progressInfo.completed = 100;
1652 progressInfo.total = 100;
1653 }
1654 retcode = copyout(&progressInfo, *progressInfoPtr,
1655 sizeof(RF_ProgressInfo_t));
1656 return (retcode);
1657
1658 /* the sparetable daemon calls this to wait for the kernel to
1659 * need a spare table. this ioctl does not return until a
1660 * spare table is needed. XXX -- calling mpsleep here in the
1661 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1662 * -- I should either compute the spare table in the kernel,
1663 * or have a different -- XXX XXX -- interface (a different
1664 * character device) for delivering the table -- XXX */
1665 #if 0
1666 case RAIDFRAME_SPARET_WAIT:
1667 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1668 while (!rf_sparet_wait_queue)
1669 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1670 waitreq = rf_sparet_wait_queue;
1671 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1672 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1673
1674 /* structure assignment */
1675 *((RF_SparetWait_t *) data) = *waitreq;
1676
1677 RF_Free(waitreq, sizeof(*waitreq));
1678 return (0);
1679
1680 /* wakes up a process waiting on SPARET_WAIT and puts an error
	 * code in it that will cause the daemon to exit */
1682 case RAIDFRAME_ABORT_SPARET_WAIT:
1683 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1684 waitreq->fcol = -1;
1685 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1686 waitreq->next = rf_sparet_wait_queue;
1687 rf_sparet_wait_queue = waitreq;
1688 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1689 wakeup(&rf_sparet_wait_queue);
1690 return (0);
1691
1692 /* used by the spare table daemon to deliver a spare table
1693 * into the kernel */
1694 case RAIDFRAME_SEND_SPARET:
1695
1696 /* install the spare table */
1697 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1698
1699 /* respond to the requestor. the return status of the spare
1700 * table installation is passed in the "fcol" field */
1701 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1702 waitreq->fcol = retcode;
1703 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1704 waitreq->next = rf_sparet_resp_queue;
1705 rf_sparet_resp_queue = waitreq;
1706 wakeup(&rf_sparet_resp_queue);
1707 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1708
1709 return (retcode);
1710 #endif
1711
1712 default:
1713 break; /* fall through to the os-specific code below */
1714
1715 }
1716
1717 if (!raidPtr->valid)
1718 return (EINVAL);
1719
1720 /*
1721 * Add support for "regular" device ioctls here.
1722 */
1723
1724 switch (cmd) {
1725 case DIOCGDINFO:
1726 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1727 break;
1728 #ifdef __HAVE_OLD_DISKLABEL
1729 case ODIOCGDINFO:
1730 newlabel = *(rs->sc_dkdev.dk_label);
1731 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1732 return ENOTTY;
1733 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1734 break;
1735 #endif
1736
1737 case DIOCGPART:
1738 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1739 ((struct partinfo *) data)->part =
1740 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1741 break;
1742
1743 case DIOCWDINFO:
1744 case DIOCSDINFO:
1745 #ifdef __HAVE_OLD_DISKLABEL
1746 case ODIOCWDINFO:
1747 case ODIOCSDINFO:
1748 #endif
1749 {
1750 struct disklabel *lp;
1751 #ifdef __HAVE_OLD_DISKLABEL
1752 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1753 memset(&newlabel, 0, sizeof newlabel);
1754 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1755 lp = &newlabel;
1756 } else
1757 #endif
1758 lp = (struct disklabel *)data;
1759
1760 if ((error = raidlock(rs)) != 0)
1761 return (error);
1762
1763 rs->sc_flags |= RAIDF_LABELLING;
1764
1765 error = setdisklabel(rs->sc_dkdev.dk_label,
1766 lp, 0, rs->sc_dkdev.dk_cpulabel);
1767 if (error == 0) {
1768 if (cmd == DIOCWDINFO
1769 #ifdef __HAVE_OLD_DISKLABEL
1770 || cmd == ODIOCWDINFO
1771 #endif
1772 )
1773 error = writedisklabel(RAIDLABELDEV(dev),
1774 raidstrategy, rs->sc_dkdev.dk_label,
1775 rs->sc_dkdev.dk_cpulabel);
1776 }
1777 rs->sc_flags &= ~RAIDF_LABELLING;
1778
1779 raidunlock(rs);
1780
1781 if (error)
1782 return (error);
1783 break;
1784 }
1785
1786 case DIOCWLABEL:
1787 if (*(int *) data != 0)
1788 rs->sc_flags |= RAIDF_WLABEL;
1789 else
1790 rs->sc_flags &= ~RAIDF_WLABEL;
1791 break;
1792
1793 case DIOCGDEFLABEL:
1794 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1795 break;
1796
1797 #ifdef __HAVE_OLD_DISKLABEL
1798 case ODIOCGDEFLABEL:
1799 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1800 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1801 return ENOTTY;
1802 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1803 break;
1804 #endif
1805
1806 case DIOCAWEDGE:
1807 case DIOCDWEDGE:
1808 dkw = (void *)data;
1809
1810 /* If the ioctl happens here, the parent is us. */
1811 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1812 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1813
1814 case DIOCLWEDGES:
1815 return dkwedge_list(&rs->sc_dkdev,
1816 (struct dkwedge_list *)data, l);
1817
1818 default:
1819 retcode = ENOTTY;
1820 }
1821 return (retcode);
1822
1823 }
1824
1825
/* raidinit -- complete the rest of the initialization for the
   RAIDframe device: mark the softc as initialized, attach the
   autoconf pseudo-device, and register/attach the disk so that
   disklabels and wedges work.  Called after rf_Configure() has
   succeeded (see RAIDFRAME_CONFIGURE in raidioctl). */


static void
raidinit(RF_Raid_t *raidPtr)
{
	struct cfdata *cf;
	struct raid_softc *rs;
	int unit;

	unit = raidPtr->raidid;

	rs = &raid_softc[unit];

	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	/* Build the device name ("raid0", "raid1", ...) used for the
	 * disk structure and wedge parent naming below. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device */
	/* cf is freed again in RAIDFRAME_SHUTDOWN via device_cfdata()
	 * after config_detach(). */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev==NULL) {
		/* NOTE(review): on failure 'cf' is leaked and
		 * initialization continues with sc_dev == NULL --
		 * confirm later users of rs->sc_dev tolerate that. */
		printf("raid%d: config_attach_pseudo failed\n",
		       raidPtr->raidid);
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe. */

	rs->sc_size = raidPtr->totalSectors;

	/* Autodiscover any wedges (GPT partitions, etc.) on the new disk. */
	dkwedge_discover(&rs->sc_dkdev);

	rf_set_properties(rs, raidPtr);

}
1878 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1879 /* wake up the daemon & tell it to get us a spare table
1880 * XXX
1881 * the entries in the queues should be tagged with the raidPtr
1882 * so that in the extremely rare case that two recons happen at once,
1883 * we know for which device were requesting a spare table
1884 * XXX
1885 *
1886 * XXX This code is not currently used. GO
1887 */
1888 int
1889 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1890 {
1891 int retcode;
1892
1893 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1894 req->next = rf_sparet_wait_queue;
1895 rf_sparet_wait_queue = req;
1896 wakeup(&rf_sparet_wait_queue);
1897
1898 /* mpsleep unlocks the mutex */
1899 while (!rf_sparet_resp_queue) {
1900 tsleep(&rf_sparet_resp_queue, PRIBIO,
1901 "raidframe getsparetable", 0);
1902 }
1903 req = rf_sparet_resp_queue;
1904 rf_sparet_resp_queue = req->next;
1905 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1906
1907 retcode = req->fcol;
1908 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1909 * alloc'd */
1910 return (retcode);
1911 }
1912 #endif
1913
/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Drains the softc's buf queue while raidPtr->openings permits,
 * translating each buf's partition-relative block number into a RAID
 * address and handing it to rf_DoAccess().  Bufs that fail sanity
 * checks (past end of set, not sector-aligned) are completed here
 * with biodone().
 *
 * Locking: raidPtr->mutex is taken at entry, held at the top of each
 * loop iteration, dropped while a buf is processed, and retaken
 * before re-testing openings; it is released before returning.
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	int unit;
	struct raid_softc *rs;
	int do_async;
	struct buf *bp;
	int rc;

	unit = raidPtr->raidid;
	rs = &raid_softc[unit];

	/* quick check to see if anything has died recently */
	RF_LOCK_MUTEX(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the lock: rf_update_component_labels() does
		 * component I/O and may sleep. */
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* mutex is held each time this condition is evaluated */
	while (raidPtr->openings > 0) {
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = BUFQ_GET(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			/* add the partition's start offset for
			 * non-raw-partition I/O */
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb: one extra sector if the byte count isn't
		 * sector-aligned */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* NOTE(review): the "1 ||" forces this debug block on
		 * unconditionally; db1_printf presumably compiles away
		 * unless debugging is enabled -- confirm. */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject I/O past the end of the set; the "sum <" tests
		 * also catch arithmetic wraparound of the sector sum. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Reject transfers that aren't a multiple of the sector
		 * size. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* Consume one opening for this I/O; it is returned when
		 * the access completes. */
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->openings--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* access was rejected up front -- complete the
			 * buf with the error here */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		RF_LOCK_MUTEX(raidPtr->mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->mutex);
}
2042
2043
2044
2045
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
/* Issues the request described by 'req' against the component backing
 * 'queue': NOPs are completed immediately via KernelWakeupFunc(); reads
 * and writes are set up with InitBP() and handed to bdev_strategy(),
 * with KernelWakeupFunc() as the completion callback.  Always returns 0. */

int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;

#if DIAGNOSTIC
	/* paranoia: the raid unit must exist in raid_softc[] */
	if (queue->raidPtr->raidid >= numraid) {
		printf("Invalid unit number: %d %d\n", queue->raidPtr->raidid,
		       numraid);
		panic("Invalid Unit number in rf_DispatchKernelIO");
	}
#endif

	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		/* deliberately printf (not db1_printf) so sightings get
		 * reported */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete the NOP immediately -- no device I/O needed */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		/* start the physical-I/O timer; stopped in
		 * KernelWakeupFunc() on completion */
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf for the component device; completion
		 * fires KernelWakeupFunc(bp) with bp->b_private == req */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		       op, queue->rf_cinfo->ci_dev,
		       req->sectorOffset, req->numSector,
		       req->buf, KernelWakeupFunc, (void *) req,
		       queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				    (long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			    (int) req->sectorOffset, (int) req->numSector,
			    (int) (req->numSector <<
				   queue->raidPtr->logBytesPerSector),
			    (int) queue->raidPtr->logBytesPerSector));
		/* hand the buf to the component's strategy routine */
		bdev_strategy(bp);

		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with a I/O invoked from
   kernel code.

   Records trace timing, marks the component failed on I/O error (when
   doing so would not exceed the layout's fault tolerance), copies the
   error into the request, and queues the request on the raidPtr's
   iodone list for the raidio thread to finish.  Runs at splbio.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;
	int s;

	s = splbio();
	db1_printf(("recovering the request queue:\n"));
	/* the request was stashed in b_private by InitBP()/DispatchKernelIO */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

#if RF_ACC_TRACE > 0
	/* accumulate physical I/O timing stats for this access */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		RF_LOCK_MUTEX(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		RF_UNLOCK_MUTEX(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error. Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a component-label
			 * update in raidstart() */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */

	req->error = bp->b_error;

	simple_lock(&queue->raidPtr->iodone_lock);

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	wakeup(&(queue->raidPtr->iodone));

	simple_unlock(&queue->raidPtr->iodone_lock);

	splx(s);
}
2194
2195
2196
2197 /*
2198 * initialize a buf structure for doing an I/O in the kernel.
2199 */
2200 static void
2201 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2202 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2203 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2204 struct proc *b_proc)
2205 {
2206 /* bp->b_flags = B_PHYS | rw_flag; */
2207 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2208 bp->b_oflags = 0;
2209 bp->b_cflags = 0;
2210 bp->b_bcount = numSect << logBytesPerSector;
2211 bp->b_bufsize = bp->b_bcount;
2212 bp->b_error = 0;
2213 bp->b_dev = dev;
2214 bp->b_data = bf;
2215 bp->b_blkno = startSect;
2216 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2217 if (bp->b_bcount == 0) {
2218 panic("bp->b_bcount is zero in InitBP!!");
2219 }
2220 bp->b_proc = b_proc;
2221 bp->b_iodone = cbFunc;
2222 bp->b_private = cbArg;
2223 }
2224
2225 static void
2226 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2227 struct disklabel *lp)
2228 {
2229 memset(lp, 0, sizeof(*lp));
2230
2231 /* fabricate a label... */
2232 lp->d_secperunit = raidPtr->totalSectors;
2233 lp->d_secsize = raidPtr->bytesPerSector;
2234 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2235 lp->d_ntracks = 4 * raidPtr->numCol;
2236 lp->d_ncylinders = raidPtr->totalSectors /
2237 (lp->d_nsectors * lp->d_ntracks);
2238 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2239
2240 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2241 lp->d_type = DTYPE_RAID;
2242 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2243 lp->d_rpm = 3600;
2244 lp->d_interleave = 1;
2245 lp->d_flags = 0;
2246
2247 lp->d_partitions[RAW_PART].p_offset = 0;
2248 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2249 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2250 lp->d_npartitions = RAW_PART + 1;
2251
2252 lp->d_magic = DISKMAGIC;
2253 lp->d_magic2 = DISKMAGIC;
2254 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2255
2256 }
2257 /*
2258 * Read the disklabel from the raid device. If one is not present, fake one
2259 * up.
2260 */
static void
raidgetdisklabel(dev_t dev)
{
	int unit = raidunit(dev);
	struct raid_softc *rs = &raid_softc[unit];
	const char *errstring;
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
	RF_Raid_t *raidPtr;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = raidPtrs[unit];

	/* Start from a fabricated default; readdisklabel() below will
	   overwrite it if a real label is found on the device. */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		/* no usable on-disk label; install the default one */
		raidmakedisklabel(rs);
	else {
		int i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%d) != "
			    "the size of raid (%ld)\n", unit, rs->sc_xname,
			    lp->d_secperunit, (long) rs->sc_size);
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				    "exceeds the size of raid (%ld)\n",
				    unit, rs->sc_xname, 'a' + i, (long) rs->sc_size);
		}
	}

}
2313 /*
2314 * Take care of things one might want to take care of in the event
2315 * that a disklabel isn't present.
2316 */
2317 static void
2318 raidmakedisklabel(struct raid_softc *rs)
2319 {
2320 struct disklabel *lp = rs->sc_dkdev.dk_label;
2321 db1_printf(("Making a label..\n"));
2322
2323 /*
2324 * For historical reasons, if there's no disklabel present
2325 * the raw partition must be marked FS_BSDFFS.
2326 */
2327
2328 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2329
2330 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2331
2332 lp->d_checksum = dkcksum(lp);
2333 }
2334 /*
2335 * Wait interruptibly for an exclusive lock.
2336 *
2337 * XXX
2338 * Several drivers do this; it should be abstracted and made MP-safe.
2339 * (Hmm... where have we seen this warning before :-> GO )
2340 */
2341 static int
2342 raidlock(struct raid_softc *rs)
2343 {
2344 int error;
2345
2346 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2347 rs->sc_flags |= RAIDF_WANTED;
2348 if ((error =
2349 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2350 return (error);
2351 }
2352 rs->sc_flags |= RAIDF_LOCKED;
2353 return (0);
2354 }
2355 /*
2356 * Unlock and wake up any waiters.
2357 */
2358 static void
2359 raidunlock(struct raid_softc *rs)
2360 {
2361
2362 rs->sc_flags &= ~RAIDF_LOCKED;
2363 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2364 rs->sc_flags &= ~RAIDF_WANTED;
2365 wakeup(rs);
2366 }
2367 }
2368
2369
2370 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2371 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2372
2373 int
2374 raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter)
2375 {
2376 RF_ComponentLabel_t clabel;
2377 raidread_component_label(dev, b_vp, &clabel);
2378 clabel.mod_counter = mod_counter;
2379 clabel.clean = RF_RAID_CLEAN;
2380 raidwrite_component_label(dev, b_vp, &clabel);
2381 return(0);
2382 }
2383
2384
2385 int
2386 raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter)
2387 {
2388 RF_ComponentLabel_t clabel;
2389 raidread_component_label(dev, b_vp, &clabel);
2390 clabel.mod_counter = mod_counter;
2391 clabel.clean = RF_RAID_DIRTY;
2392 raidwrite_component_label(dev, b_vp, &clabel);
2393 return(0);
2394 }
2395
2396 /* ARGSUSED */
2397 int
2398 raidread_component_label(dev_t dev, struct vnode *b_vp,
2399 RF_ComponentLabel_t *clabel)
2400 {
2401 struct buf *bp;
2402 const struct bdevsw *bdev;
2403 int error;
2404
2405 /* XXX should probably ensure that we don't try to do this if
2406 someone has changed rf_protected_sectors. */
2407
2408 if (b_vp == NULL) {
2409 /* For whatever reason, this component is not valid.
2410 Don't try to read a component label from it. */
2411 return(EINVAL);
2412 }
2413
2414 /* get a block of the appropriate size... */
2415 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2416 bp->b_dev = dev;
2417
2418 /* get our ducks in a row for the read */
2419 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2420 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2421 bp->b_flags |= B_READ;
2422 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2423
2424 bdev = bdevsw_lookup(bp->b_dev);
2425 if (bdev == NULL)
2426 return (ENXIO);
2427 (*bdev->d_strategy)(bp);
2428
2429 error = biowait(bp);
2430
2431 if (!error) {
2432 memcpy(clabel, bp->b_data,
2433 sizeof(RF_ComponentLabel_t));
2434 }
2435
2436 brelse(bp, 0);
2437 return(error);
2438 }
2439 /* ARGSUSED */
2440 int
2441 raidwrite_component_label(dev_t dev, struct vnode *b_vp,
2442 RF_ComponentLabel_t *clabel)
2443 {
2444 struct buf *bp;
2445 const struct bdevsw *bdev;
2446 int error;
2447
2448 /* get a block of the appropriate size... */
2449 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2450 bp->b_dev = dev;
2451
2452 /* get our ducks in a row for the write */
2453 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2454 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2455 bp->b_flags |= B_WRITE;
2456 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2457
2458 memset(bp->b_data, 0, RF_COMPONENT_INFO_SIZE );
2459
2460 memcpy(bp->b_data, clabel, sizeof(RF_ComponentLabel_t));
2461
2462 bdev = bdevsw_lookup(bp->b_dev);
2463 if (bdev == NULL)
2464 return (ENXIO);
2465 (*bdev->d_strategy)(bp);
2466 error = biowait(bp);
2467 brelse(bp, 0);
2468 if (error) {
2469 #if 1
2470 printf("Failed to write RAID component info!\n");
2471 #endif
2472 }
2473
2474 return(error);
2475 }
2476
/*
 * Bump the set's mod_counter and mark every usable component (and every
 * in-use spare) dirty on disk.  Called so that an unclean shutdown can
 * be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			raidread_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			if (clabel.status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(
				    raidPtr->Disks[c].dev,
				    raidPtr->raid_cinfo[c].ci_vp,
				    raidPtr->mod_counter);
			}
		}
	}

	/* spares live at indices numCol .. numCol+numSpare-1 */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column references this spare,
			   scol keeps its previous value (initially -1) and
			   that is what gets written below -- confirm this
			   cannot happen for an rf_ds_used_spare disk. */

			raidread_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, &clabel);

			clabel.row = 0;
			clabel.column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr->Disks[sparecol].dev,
				      raidPtr->raid_cinfo[sparecol].ci_vp,
				      raidPtr->mod_counter);
		}
	}
}
2547
2548
/*
 * Rewrite the component labels of all optimal components and in-use
 * spares with a freshly bumped mod_counter.  When final is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, also mark each
 * label clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			raidread_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			/* make sure status is noted */
			clabel.status = rf_ds_optimal;

			/* bump the counter */
			clabel.mod_counter = raidPtr->mod_counter;

			/* note what unit we are configured as */
			clabel.last_unit = raidPtr->raidid;

			raidwrite_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(
						raidPtr->Disks[c].dev,
						raidPtr->raid_cinfo[c].ci_vp,
						raidPtr->mod_counter);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the data column this spare replaced */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			raidread_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, &clabel);

			clabel.mod_counter = raidPtr->mod_counter;
			clabel.column = scol;
			clabel.status = rf_ds_optimal;
			clabel.last_unit = raidPtr->raidid;

			raidwrite_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean( raidPtr->Disks[sparecol].dev,
						       raidPtr->raid_cinfo[sparecol].ci_vp,
						       raidPtr->mod_counter);
				}
			}
		}
	}
}
2644
2645 void
2646 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2647 {
2648
2649 if (vp != NULL) {
2650 if (auto_configured == 1) {
2651 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2652 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2653 vput(vp);
2654
2655 } else {
2656 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2657 }
2658 }
2659 }
2660
2661
2662 void
2663 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2664 {
2665 int r,c;
2666 struct vnode *vp;
2667 int acd;
2668
2669
2670 /* We take this opportunity to close the vnodes like we should.. */
2671
2672 for (c = 0; c < raidPtr->numCol; c++) {
2673 vp = raidPtr->raid_cinfo[c].ci_vp;
2674 acd = raidPtr->Disks[c].auto_configured;
2675 rf_close_component(raidPtr, vp, acd);
2676 raidPtr->raid_cinfo[c].ci_vp = NULL;
2677 raidPtr->Disks[c].auto_configured = 0;
2678 }
2679
2680 for (r = 0; r < raidPtr->numSpare; r++) {
2681 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2682 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2683 rf_close_component(raidPtr, vp, acd);
2684 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2685 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2686 }
2687 }
2688
2689
2690 void
2691 rf_ReconThread(struct rf_recon_req *req)
2692 {
2693 int s;
2694 RF_Raid_t *raidPtr;
2695
2696 s = splbio();
2697 raidPtr = (RF_Raid_t *) req->raidPtr;
2698 raidPtr->recon_in_progress = 1;
2699
2700 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2701 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2702
2703 RF_Free(req, sizeof(*req));
2704
2705 raidPtr->recon_in_progress = 0;
2706 splx(s);
2707
2708 /* That's all... */
2709 kthread_exit(0); /* does not return */
2710 }
2711
2712 void
2713 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2714 {
2715 int retcode;
2716 int s;
2717
2718 raidPtr->parity_rewrite_stripes_done = 0;
2719 raidPtr->parity_rewrite_in_progress = 1;
2720 s = splbio();
2721 retcode = rf_RewriteParity(raidPtr);
2722 splx(s);
2723 if (retcode) {
2724 printf("raid%d: Error re-writing parity!\n",raidPtr->raidid);
2725 } else {
2726 /* set the clean bit! If we shutdown correctly,
2727 the clean bit on each component label will get
2728 set */
2729 raidPtr->parity_good = RF_RAID_CLEAN;
2730 }
2731 raidPtr->parity_rewrite_in_progress = 0;
2732
2733 /* Anyone waiting for us to stop? If so, inform them... */
2734 if (raidPtr->waitShutdown) {
2735 wakeup(&raidPtr->parity_rewrite_in_progress);
2736 }
2737
2738 /* That's all... */
2739 kthread_exit(0); /* does not return */
2740 }
2741
2742
2743 void
2744 rf_CopybackThread(RF_Raid_t *raidPtr)
2745 {
2746 int s;
2747
2748 raidPtr->copyback_in_progress = 1;
2749 s = splbio();
2750 rf_CopybackReconstructedData(raidPtr);
2751 splx(s);
2752 raidPtr->copyback_in_progress = 0;
2753
2754 /* That's all... */
2755 kthread_exit(0); /* does not return */
2756 }
2757
2758
2759 void
2760 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2761 {
2762 int s;
2763 RF_Raid_t *raidPtr;
2764
2765 s = splbio();
2766 raidPtr = req->raidPtr;
2767 raidPtr->recon_in_progress = 1;
2768 rf_ReconstructInPlace(raidPtr, req->col);
2769 RF_Free(req, sizeof(*req));
2770 raidPtr->recon_in_progress = 0;
2771 splx(s);
2772
2773 /* That's all... */
2774 kthread_exit(0); /* does not return */
2775 }
2776
/*
 * Probe one candidate component: read its component label and, if the
 * label is reasonable and fits within `size' sectors, prepend a new
 * RF_AutoConfig_t (which takes ownership of vp and the label) to
 * ac_list.  Otherwise the label is freed and vp is closed/released.
 * Returns the (possibly new) head of ac_list, or NULL if the whole
 * list had to be torn down on allocation failure.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* out of memory: free every entry accumulated so far */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(dev, vp, clabel)) {
		/* Got the label. Does it look reasonable? */
		if (rf_reasonable_label(clabel) &&
		    (clabel->partitionSize <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			       cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;	/* ac now owns the vnode reference */
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: nothing took ownership of clabel or vp */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2833
/*
 * Scan every disk device in the system for RAIDframe components and
 * return a list of RF_AutoConfig_t entries for the ones found.
 * Wedges (dk) are checked by wedge type; other disks are checked by
 * scanning their disklabel for FS_RAID partitions.
 */
RF_AutoConfig_t *
rf_find_raid_components()
{
	struct vnode *vp;
	struct disklabel label;
	struct device *dv;
	dev_t dev;
	int bmajor, bminor, wedge;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;


	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = alldevs.tqh_first; dv != NULL;
	     dv = dv->dv_list.tqe_next) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		/* get a vnode for the raw partition of this disk */

		/* wedges are addressed by plain minor; others by
		   (unit, RAW_PART) */
		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		if (wedge) {
			/* wedge path: accept only RAIDframe-typed wedges */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component takes ownership of vp */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size);
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		/* open each FS_RAID partition separately and probe it */
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size);
		}
	}
	return ac_list;
}
2976
2977
2978 static int
2979 rf_reasonable_label(RF_ComponentLabel_t *clabel)
2980 {
2981
2982 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2983 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2984 ((clabel->clean == RF_RAID_CLEAN) ||
2985 (clabel->clean == RF_RAID_DIRTY)) &&
2986 clabel->row >=0 &&
2987 clabel->column >= 0 &&
2988 clabel->num_rows > 0 &&
2989 clabel->num_columns > 0 &&
2990 clabel->row < clabel->num_rows &&
2991 clabel->column < clabel->num_columns &&
2992 clabel->blockSize > 0 &&
2993 clabel->numBlocks > 0) {
2994 /* label looks reasonable enough... */
2995 return(1);
2996 }
2997 return(0);
2998 }
2999
3000
#ifdef DEBUG
/* Pretty-print the contents of a component label (debug builds only). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status );
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %d\n",
	       (char) clabel->parityConfig, clabel->blockSize,
	       clabel->numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No" );
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No" );
	printf("   Last configured as: raid%d\n", clabel->last_unit );
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3028
3029 RF_ConfigSet_t *
3030 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3031 {
3032 RF_AutoConfig_t *ac;
3033 RF_ConfigSet_t *config_sets;
3034 RF_ConfigSet_t *cset;
3035 RF_AutoConfig_t *ac_next;
3036
3037
3038 config_sets = NULL;
3039
3040 /* Go through the AutoConfig list, and figure out which components
3041 belong to what sets. */
3042 ac = ac_list;
3043 while(ac!=NULL) {
3044 /* we're going to putz with ac->next, so save it here
3045 for use at the end of the loop */
3046 ac_next = ac->next;
3047
3048 if (config_sets == NULL) {
3049 /* will need at least this one... */
3050 config_sets = (RF_ConfigSet_t *)
3051 malloc(sizeof(RF_ConfigSet_t),
3052 M_RAIDFRAME, M_NOWAIT);
3053 if (config_sets == NULL) {
3054 panic("rf_create_auto_sets: No memory!");
3055 }
3056 /* this one is easy :) */
3057 config_sets->ac = ac;
3058 config_sets->next = NULL;
3059 config_sets->rootable = 0;
3060 ac->next = NULL;
3061 } else {
3062 /* which set does this component fit into? */
3063 cset = config_sets;
3064 while(cset!=NULL) {
3065 if (rf_does_it_fit(cset, ac)) {
3066 /* looks like it matches... */
3067 ac->next = cset->ac;
3068 cset->ac = ac;
3069 break;
3070 }
3071 cset = cset->next;
3072 }
3073 if (cset==NULL) {
3074 /* didn't find a match above... new set..*/
3075 cset = (RF_ConfigSet_t *)
3076 malloc(sizeof(RF_ConfigSet_t),
3077 M_RAIDFRAME, M_NOWAIT);
3078 if (cset == NULL) {
3079 panic("rf_create_auto_sets: No memory!");
3080 }
3081 cset->ac = ac;
3082 ac->next = NULL;
3083 cset->next = config_sets;
3084 cset->rootable = 0;
3085 config_sets = cset;
3086 }
3087 }
3088 ac = ac_next;
3089 }
3090
3091
3092 return(config_sets);
3093 }
3094
3095 static int
3096 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3097 {
3098 RF_ComponentLabel_t *clabel1, *clabel2;
3099
3100 /* If this one matches the *first* one in the set, that's good
3101 enough, since the other members of the set would have been
3102 through here too... */
3103 /* note that we are not checking partitionSize here..
3104
3105 Note that we are also not checking the mod_counters here.
3106 If everything else matches execpt the mod_counter, that's
3107 good enough for this test. We will deal with the mod_counters
3108 a little later in the autoconfiguration process.
3109
3110 (clabel1->mod_counter == clabel2->mod_counter) &&
3111
3112 The reason we don't check for this is that failed disks
3113 will have lower modification counts. If those disks are
3114 not added to the set they used to belong to, then they will
3115 form their own set, which may result in 2 different sets,
3116 for example, competing to be configured at raid0, and
3117 perhaps competing to be the root filesystem set. If the
3118 wrong ones get configured, or both attempt to become /,
3119 weird behaviour and or serious lossage will occur. Thus we
3120 need to bring them into the fold here, and kick them out at
3121 a later point.
3122
3123 */
3124
3125 clabel1 = cset->ac->clabel;
3126 clabel2 = ac->clabel;
3127 if ((clabel1->version == clabel2->version) &&
3128 (clabel1->serial_number == clabel2->serial_number) &&
3129 (clabel1->num_rows == clabel2->num_rows) &&
3130 (clabel1->num_columns == clabel2->num_columns) &&
3131 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3132 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3133 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3134 (clabel1->parityConfig == clabel2->parityConfig) &&
3135 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3136 (clabel1->blockSize == clabel2->blockSize) &&
3137 (clabel1->numBlocks == clabel2->numBlocks) &&
3138 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3139 (clabel1->root_partition == clabel2->root_partition) &&
3140 (clabel1->last_unit == clabel2->last_unit) &&
3141 (clabel1->config_order == clabel2->config_order)) {
3142 /* if it get's here, it almost *has* to be a match */
3143 } else {
3144 /* it's not consistent with somebody in the set..
3145 punt */
3146 return(0);
3147 }
3148 /* all was fine.. it must fit... */
3149 return(1);
3150 }
3151
/*
 * Decide whether a configuration set has enough live components (at
 * the newest mod_counter) to be configured.  RAID 1 gets special
 * even/odd pairing treatment; RAID 0/4/5 use simple missing-component
 * counts.  Returns 1 if configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* the authoritative mod_counter is the maximum over all members;
	   components with a lower value are stale (failed earlier) */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (non-stale) component in column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
                                            we're failed, and
                                            so is the even
                                            component, it's
                                            "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3254
3255 void
3256 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3257 RF_Raid_t *raidPtr)
3258 {
3259 RF_ComponentLabel_t *clabel;
3260 int i;
3261
3262 clabel = ac->clabel;
3263
3264 /* 1. Fill in the common stuff */
3265 config->numRow = clabel->num_rows = 1;
3266 config->numCol = clabel->num_columns;
3267 config->numSpare = 0; /* XXX should this be set here? */
3268 config->sectPerSU = clabel->sectPerSU;
3269 config->SUsPerPU = clabel->SUsPerPU;
3270 config->SUsPerRU = clabel->SUsPerRU;
3271 config->parityConfig = clabel->parityConfig;
3272 /* XXX... */
3273 strcpy(config->diskQueueType,"fifo");
3274 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3275 config->layoutSpecificSize = 0; /* XXX ?? */
3276
3277 while(ac!=NULL) {
3278 /* row/col values will be in range due to the checks
3279 in reasonable_label() */
3280 strcpy(config->devnames[0][ac->clabel->column],
3281 ac->devname);
3282 ac = ac->next;
3283 }
3284
3285 for(i=0;i<RF_MAXDBGV;i++) {
3286 config->debugVars[i][0] = 0;
3287 }
3288 }
3289
3290 int
3291 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3292 {
3293 RF_ComponentLabel_t clabel;
3294 struct vnode *vp;
3295 dev_t dev;
3296 int column;
3297 int sparecol;
3298
3299 raidPtr->autoconfigure = new_value;
3300
3301 for(column=0; column<raidPtr->numCol; column++) {
3302 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3303 dev = raidPtr->Disks[column].dev;
3304 vp = raidPtr->raid_cinfo[column].ci_vp;
3305 raidread_component_label(dev, vp, &clabel);
3306 clabel.autoconfigure = new_value;
3307 raidwrite_component_label(dev, vp, &clabel);
3308 }
3309 }
3310 for(column = 0; column < raidPtr->numSpare ; column++) {
3311 sparecol = raidPtr->numCol + column;
3312 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3313 dev = raidPtr->Disks[sparecol].dev;
3314 vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3315 raidread_component_label(dev, vp, &clabel);
3316 clabel.autoconfigure = new_value;
3317 raidwrite_component_label(dev, vp, &clabel);
3318 }
3319 }
3320 return(new_value);
3321 }
3322
3323 int
3324 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3325 {
3326 RF_ComponentLabel_t clabel;
3327 struct vnode *vp;
3328 dev_t dev;
3329 int column;
3330 int sparecol;
3331
3332 raidPtr->root_partition = new_value;
3333 for(column=0; column<raidPtr->numCol; column++) {
3334 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3335 dev = raidPtr->Disks[column].dev;
3336 vp = raidPtr->raid_cinfo[column].ci_vp;
3337 raidread_component_label(dev, vp, &clabel);
3338 clabel.root_partition = new_value;
3339 raidwrite_component_label(dev, vp, &clabel);
3340 }
3341 }
3342 for(column = 0; column < raidPtr->numSpare ; column++) {
3343 sparecol = raidPtr->numCol + column;
3344 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3345 dev = raidPtr->Disks[sparecol].dev;
3346 vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3347 raidread_component_label(dev, vp, &clabel);
3348 clabel.root_partition = new_value;
3349 raidwrite_component_label(dev, vp, &clabel);
3350 }
3351 }
3352 return(new_value);
3353 }
3354
3355 void
3356 rf_release_all_vps(RF_ConfigSet_t *cset)
3357 {
3358 RF_AutoConfig_t *ac;
3359
3360 ac = cset->ac;
3361 while(ac!=NULL) {
3362 /* Close the vp, and give it back */
3363 if (ac->vp) {
3364 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3365 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3366 vput(ac->vp);
3367 ac->vp = NULL;
3368 }
3369 ac = ac->next;
3370 }
3371 }
3372
3373
3374 void
3375 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3376 {
3377 RF_AutoConfig_t *ac;
3378 RF_AutoConfig_t *next_ac;
3379
3380 ac = cset->ac;
3381 while(ac!=NULL) {
3382 next_ac = ac->next;
3383 /* nuke the label */
3384 free(ac->clabel, M_RAIDFRAME);
3385 /* cleanup the config structure */
3386 free(ac, M_RAIDFRAME);
3387 /* "next.." */
3388 ac = next_ac;
3389 }
3390 /* and, finally, nuke the config set */
3391 free(cset, M_RAIDFRAME);
3392 }
3393
3394
3395 void
3396 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3397 {
3398 /* current version number */
3399 clabel->version = RF_COMPONENT_LABEL_VERSION;
3400 clabel->serial_number = raidPtr->serial_number;
3401 clabel->mod_counter = raidPtr->mod_counter;
3402 clabel->num_rows = 1;
3403 clabel->num_columns = raidPtr->numCol;
3404 clabel->clean = RF_RAID_DIRTY; /* not clean */
3405 clabel->status = rf_ds_optimal; /* "It's good!" */
3406
3407 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3408 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3409 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3410
3411 clabel->blockSize = raidPtr->bytesPerSector;
3412 clabel->numBlocks = raidPtr->sectorsPerDisk;
3413
3414 /* XXX not portable */
3415 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3416 clabel->maxOutstanding = raidPtr->maxOutstanding;
3417 clabel->autoconfigure = raidPtr->autoconfigure;
3418 clabel->root_partition = raidPtr->root_partition;
3419 clabel->last_unit = raidPtr->raidid;
3420 clabel->config_order = raidPtr->config_order;
3421 }
3422
3423 int
3424 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
3425 {
3426 RF_Raid_t *raidPtr;
3427 RF_Config_t *config;
3428 int raidID;
3429 int retcode;
3430
3431 #ifdef DEBUG
3432 printf("RAID autoconfigure\n");
3433 #endif
3434
3435 retcode = 0;
3436 *unit = -1;
3437
3438 /* 1. Create a config structure */
3439
3440 config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
3441 M_RAIDFRAME,
3442 M_NOWAIT);
3443 if (config==NULL) {
3444 printf("Out of mem!?!?\n");
3445 /* XXX do something more intelligent here. */
3446 return(1);
3447 }
3448
3449 memset(config, 0, sizeof(RF_Config_t));
3450
3451 /*
3452 2. Figure out what RAID ID this one is supposed to live at
3453 See if we can get the same RAID dev that it was configured
3454 on last time..
3455 */
3456
3457 raidID = cset->ac->clabel->last_unit;
3458 if ((raidID < 0) || (raidID >= numraid)) {
3459 /* let's not wander off into lala land. */
3460 raidID = numraid - 1;
3461 }
3462 if (raidPtrs[raidID]->valid != 0) {
3463
3464 /*
3465 Nope... Go looking for an alternative...
3466 Start high so we don't immediately use raid0 if that's
3467 not taken.
3468 */
3469
3470 for(raidID = numraid - 1; raidID >= 0; raidID--) {
3471 if (raidPtrs[raidID]->valid == 0) {
3472 /* can use this one! */
3473 break;
3474 }
3475 }
3476 }
3477
3478 if (raidID < 0) {
3479 /* punt... */
3480 printf("Unable to auto configure this set!\n");
3481 printf("(Out of RAID devs!)\n");
3482 free(config, M_RAIDFRAME);
3483 return(1);
3484 }
3485
3486 #ifdef DEBUG
3487 printf("Configuring raid%d:\n",raidID);
3488 #endif
3489
3490 raidPtr = raidPtrs[raidID];
3491
3492 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3493 raidPtr->raidid = raidID;
3494 raidPtr->openings = RAIDOUTSTANDING;
3495
3496 /* 3. Build the configuration structure */
3497 rf_create_configuration(cset->ac, config, raidPtr);
3498
3499 /* 4. Do the configuration */
3500 retcode = rf_Configure(raidPtr, config, cset->ac);
3501
3502 if (retcode == 0) {
3503
3504 raidinit(raidPtrs[raidID]);
3505
3506 rf_markalldirty(raidPtrs[raidID]);
3507 raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
3508 if (cset->ac->clabel->root_partition==1) {
3509 /* everything configured just fine. Make a note
3510 that this set is eligible to be root. */
3511 cset->rootable = 1;
3512 /* XXX do this here? */
3513 raidPtrs[raidID]->root_partition = 1;
3514 }
3515 }
3516
3517 /* 5. Cleanup */
3518 free(config, M_RAIDFRAME);
3519
3520 *unit = raidID;
3521 return(retcode);
3522 }
3523
3524 void
3525 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3526 {
3527 struct buf *bp;
3528
3529 bp = (struct buf *)desc->bp;
3530 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3531 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3532 }
3533
/*
 * Initialize a RAIDframe memory pool: create it at IPL_BIO, cap it at
 * xmax items, pre-allocate xmin items, and keep at least xmin on hand.
 * The prime/setlowat ordering matters: priming first ensures the low
 * water mark is already satisfied when it is set.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3543
3544 /*
3545 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3546 * if there is IO pending and if that IO could possibly be done for a
3547 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3548 * otherwise.
3549 *
3550 */
3551
3552 int
3553 rf_buf_queue_check(int raidid)
3554 {
3555 if ((BUFQ_PEEK(raid_softc[raidid].buf_queue) != NULL) &&
3556 raidPtrs[raidid]->openings > 0) {
3557 /* there is work to do */
3558 return 0;
3559 }
3560 /* default is nothing to do */
3561 return 1;
3562 }
3563
3564 int
3565 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
3566 {
3567 struct partinfo dpart;
3568 struct dkwedge_info dkw;
3569 int error;
3570
3571 error = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, l->l_cred);
3572 if (error == 0) {
3573 diskPtr->blockSize = dpart.disklab->d_secsize;
3574 diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
3575 diskPtr->partitionSize = dpart.part->p_size;
3576 return 0;
3577 }
3578
3579 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, l->l_cred);
3580 if (error == 0) {
3581 diskPtr->blockSize = 512; /* XXX */
3582 diskPtr->numBlocks = dkw.dkw_size - rf_protectedSectors;
3583 diskPtr->partitionSize = dkw.dkw_size;
3584 return 0;
3585 }
3586 return error;
3587 }
3588
/*
 * Autoconf match routine: raid pseudo-devices always match, since
 * their instances are created programmatically rather than probed.
 */
static int
raid_match(struct device *self, struct cfdata *cfdata,
    void *aux)
{
	return 1;
}
3595
3596 static void
3597 raid_attach(struct device *parent, struct device *self,
3598 void *aux)
3599 {
3600 int i, unit;
3601 int bmaj = bdevsw_lookup_major(&raid_bdevsw);
3602 int cmaj = cdevsw_lookup_major(&raid_cdevsw);
3603
3604 unit = device_unit(self);
3605
3606 /* register device nodes */
3607 for (i = 0; i < MAXPARTITIONS; i++) {
3608 /* block devices */
3609 device_register_name(MAKEDISKDEV(bmaj, unit, i), self, false,
3610 DEV_DISK, "raid%d%c", unit, 'a' + i);
3611 /* char devices */
3612 device_register_name(MAKEDISKDEV(cmaj, unit, i), self, true,
3613 DEV_DISK, "rraid%d%c", unit, 'a' + i);
3614 }
3615 }
3616
3617
3618 static int
3619 raid_detach(struct device *self, int flags)
3620 {
3621 struct raid_softc *rs = (struct raid_softc *)self;
3622
3623 if (rs->sc_flags & RAIDF_INITED)
3624 return EBUSY;
3625
3626 device_deregister_all(self);
3627
3628 return 0;
3629 }
3630
3631 static void
3632 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
3633 {
3634 prop_dictionary_t disk_info, odisk_info, geom;
3635 disk_info = prop_dictionary_create();
3636 geom = prop_dictionary_create();
3637 prop_dictionary_set_uint64(geom, "sectors-per-unit",
3638 raidPtr->totalSectors);
3639 prop_dictionary_set_uint32(geom, "sector-size",
3640 raidPtr->bytesPerSector);
3641
3642 prop_dictionary_set_uint16(geom, "sectors-per-track",
3643 raidPtr->Layout.dataSectorsPerStripe);
3644 prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
3645 4 * raidPtr->numCol);
3646
3647 prop_dictionary_set_uint64(geom, "cylinders-per-unit",
3648 raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
3649 (4 * raidPtr->numCol)));
3650
3651 prop_dictionary_set(disk_info, "geometry", geom);
3652 prop_object_release(geom);
3653 prop_dictionary_set(device_properties(rs->sc_dev),
3654 "disk-info", disk_info);
3655 odisk_info = rs->sc_dkdev.dk_info;
3656 rs->sc_dkdev.dk_info = disk_info;
3657 if (odisk_info)
3658 prop_object_release(odisk_info);
3659 }
3660