rf_netbsdkintf.c revision 1.290 1 /* $NetBSD: rf_netbsdkintf.c,v 1.290 2011/05/10 05:08:51 mrg Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.290 2011/05/10 05:08:51 mrg Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #include "raid.h"
110 #endif
111
112 #include <sys/param.h>
113 #include <sys/errno.h>
114 #include <sys/pool.h>
115 #include <sys/proc.h>
116 #include <sys/queue.h>
117 #include <sys/disk.h>
118 #include <sys/device.h>
119 #include <sys/stat.h>
120 #include <sys/ioctl.h>
121 #include <sys/fcntl.h>
122 #include <sys/systm.h>
123 #include <sys/vnode.h>
124 #include <sys/disklabel.h>
125 #include <sys/conf.h>
126 #include <sys/buf.h>
127 #include <sys/bufq.h>
128 #include <sys/reboot.h>
129 #include <sys/kauth.h>
130
131 #include <prop/proplib.h>
132
133 #include <dev/raidframe/raidframevar.h>
134 #include <dev/raidframe/raidframeio.h>
135 #include <dev/raidframe/rf_paritymap.h>
136
137 #include "rf_raid.h"
138 #include "rf_copyback.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150
151 #ifdef COMPAT_50
152 #include "rf_compat50.h"
153 #endif
154
155 #ifdef DEBUG
156 int rf_kdebug_level = 0;
157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
158 #else /* DEBUG */
159 #define db1_printf(a) { }
160 #endif /* DEBUG */
161
162 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
163
164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
165 static rf_declare_mutex2(rf_sparet_wait_mutex);
166 static rf_declare_cond2(rf_sparet_wait_cv);
167 static rf_declare_cond2(rf_sparet_resp_cv);
168
169 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
170 * spare table */
171 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
172 * installation process */
173 #endif
174
175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
176
177 /* prototypes */
178 static void KernelWakeupFunc(struct buf *);
179 static void InitBP(struct buf *, struct vnode *, unsigned,
180 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
181 void *, int, struct proc *);
182 static void raidinit(RF_Raid_t *);
183
184 void raidattach(int);
185 static int raid_match(device_t, cfdata_t, void *);
186 static void raid_attach(device_t, device_t, void *);
187 static int raid_detach(device_t, int);
188
189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
190 daddr_t, daddr_t);
191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
192 daddr_t, daddr_t, int);
193
194 static int raidwrite_component_label(unsigned,
195 dev_t, struct vnode *, RF_ComponentLabel_t *);
196 static int raidread_component_label(unsigned,
197 dev_t, struct vnode *, RF_ComponentLabel_t *);
198
199
200 dev_type_open(raidopen);
201 dev_type_close(raidclose);
202 dev_type_read(raidread);
203 dev_type_write(raidwrite);
204 dev_type_ioctl(raidioctl);
205 dev_type_strategy(raidstrategy);
206 dev_type_dump(raiddump);
207 dev_type_size(raidsize);
208
209 const struct bdevsw raid_bdevsw = {
210 raidopen, raidclose, raidstrategy, raidioctl,
211 raiddump, raidsize, D_DISK
212 };
213
214 const struct cdevsw raid_cdevsw = {
215 raidopen, raidclose, raidread, raidwrite, raidioctl,
216 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
217 };
218
219 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
220
221 /* XXX Not sure if the following should be replacing the raidPtrs above,
222 or if it should be used in conjunction with that...
223 */
224
225 struct raid_softc {
226 device_t sc_dev;
227 int sc_flags; /* flags */
228 int sc_cflags; /* configuration flags */
229 uint64_t sc_size; /* size of the raid device */
230 char sc_xname[20]; /* XXX external name */
231 struct disk sc_dkdev; /* generic disk device info */
232 struct bufq_state *buf_queue; /* used for the device queue */
233 };
234 /* sc_flags */
235 #define RAIDF_INITED 0x01 /* unit has been initialized */
236 #define RAIDF_WLABEL 0x02 /* label area is writable */
237 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
238 #define RAIDF_SHUTDOWN 0x08 /* unit is being shutdown */
239 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
240 #define RAIDF_LOCKED 0x80 /* unit is locked */
241
242 #define raidunit(x) DISKUNIT(x)
243 int numraid = 0;
244
245 extern struct cfdriver raid_cd;
246 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
247 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
248 DVF_DETACH_SHUTDOWN);
249
250 /*
251 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
252 * Be aware that large numbers can allow the driver to consume a lot of
253 * kernel memory, especially on writes, and in degraded mode reads.
254 *
255 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
256 * a single 64K write will typically require 64K for the old data,
257 * 64K for the old parity, and 64K for the new parity, for a total
258 * of 192K (if the parity buffer is not re-used immediately).
259 * Even it if is used immediately, that's still 128K, which when multiplied
260 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
261 *
262 * Now in degraded mode, for example, a 64K read on the above setup may
263 * require data reconstruction, which will require *all* of the 4 remaining
264 * disks to participate -- 4 * 32K/disk == 128K again.
265 */
266
267 #ifndef RAIDOUTSTANDING
268 #define RAIDOUTSTANDING 6
269 #endif
270
271 #define RAIDLABELDEV(dev) \
272 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
273
274 /* declared here, and made public, for the benefit of KVM stuff.. */
275 struct raid_softc *raid_softc;
276
277 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
278 struct disklabel *);
279 static void raidgetdisklabel(dev_t);
280 static void raidmakedisklabel(struct raid_softc *);
281
282 static int raidlock(struct raid_softc *);
283 static void raidunlock(struct raid_softc *);
284
285 static int raid_detach_unlocked(struct raid_softc *);
286
287 static void rf_markalldirty(RF_Raid_t *);
288 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
289
290 void rf_ReconThread(struct rf_recon_req *);
291 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
292 void rf_CopybackThread(RF_Raid_t *raidPtr);
293 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
294 int rf_autoconfig(device_t);
295 void rf_buildroothack(RF_ConfigSet_t *);
296
297 RF_AutoConfig_t *rf_find_raid_components(void);
298 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
299 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
300 static int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
301 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
302 int rf_set_autoconfig(RF_Raid_t *, int);
303 int rf_set_rootpartition(RF_Raid_t *, int);
304 void rf_release_all_vps(RF_ConfigSet_t *);
305 void rf_cleanup_config_set(RF_ConfigSet_t *);
306 int rf_have_enough_components(RF_ConfigSet_t *);
307 int rf_auto_config_set(RF_ConfigSet_t *, int *);
308 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
309
310 static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not
311 allow autoconfig to take place.
312 Note that this is overridden by having
313 RAID_AUTOCONFIG as an option in the
314 kernel config file. */
315
316 struct RF_Pools_s rf_pools;
317
318 void
319 raidattach(int num)
320 {
321 int raidID;
322 int i, rc;
323
324 aprint_debug("raidattach: Asked for %d units\n", num);
325
326 if (num <= 0) {
327 #ifdef DIAGNOSTIC
328 panic("raidattach: count <= 0");
329 #endif
330 return;
331 }
332 /* This is where all the initialization stuff gets done. */
333
334 numraid = num;
335
336 /* Make some space for requested number of units... */
337
338 RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
339 if (raidPtrs == NULL) {
340 panic("raidPtrs is NULL!!");
341 }
342
343 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
344 rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
345 rf_init_cond2(rf_sparet_wait_cv, "sparetw");
346 rf_init_cond2(rf_sparet_resp_cv, "rfgst");
347
348 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
349 #endif
350
351 for (i = 0; i < num; i++)
352 raidPtrs[i] = NULL;
353 rc = rf_BootRaidframe();
354 if (rc == 0)
355 aprint_verbose("Kernelized RAIDframe activated\n");
356 else
357 panic("Serious error booting RAID!!");
358
359 /* put together some datastructures like the CCD device does.. This
360 * lets us lock the device and what-not when it gets opened. */
361
362 raid_softc = (struct raid_softc *)
363 malloc(num * sizeof(struct raid_softc),
364 M_RAIDFRAME, M_NOWAIT);
365 if (raid_softc == NULL) {
366 aprint_error("WARNING: no memory for RAIDframe driver\n");
367 return;
368 }
369
370 memset(raid_softc, 0, num * sizeof(struct raid_softc));
371
372 for (raidID = 0; raidID < num; raidID++) {
373 bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);
374
375 RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
376 (RF_Raid_t *));
377 if (raidPtrs[raidID] == NULL) {
378 aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
379 numraid = raidID;
380 return;
381 }
382 }
383
384 if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
385 aprint_error("raidattach: config_cfattach_attach failed?\n");
386 }
387
388 #ifdef RAID_AUTOCONFIG
389 raidautoconfig = 1;
390 #endif
391
392 /*
393 * Register a finalizer which will be used to auto-config RAID
394 * sets once all real hardware devices have been found.
395 */
396 if (config_finalize_register(NULL, rf_autoconfig) != 0)
397 aprint_error("WARNING: unable to register RAIDframe finalizer\n");
398 }
399
/*
 * rf_autoconfig:
 *	Finalizer callback (registered in raidattach) that locates RAID
 *	components on the system and auto-configures any eligible sets.
 *	Returns 0 when autoconfiguration is disabled or already done,
 *	1 after performing (or attempting) configuration.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (raidautoconfig == 0)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfig = 0;

	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
427
/*
 * rf_buildroothack:
 *	Walk the list of candidate configuration sets, auto-configure the
 *	eligible ones, and try to work out whether a configured RAID set
 *	should become the root device.  Frees/cleans up every set on the
 *	list regardless of whether it was configured.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int retcode;
	int raidID;
	int rootID;
	int col;
	int num_root;
	char *devname;

	rootID = 0;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		/* cset is consumed below; grab the link first */
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure==1) {
			retcode = rf_auto_config_set(cset,&raidID);
			if (!retcode) {
				aprint_debug("raid%d: configured ok\n", raidID);
				if (cset->rootable) {
					rootID = raidID;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		/* exactly one rootable set: it becomes the boot device */
		booted_device = raid_softc[rootID].sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/*
		 * Disambiguate: of the rootable sets, count only those
		 * that actually contain the component we booted from.
		 */
		num_root = 0;
		for (raidID = 0; raidID < numraid; raidID++) {
			if (raidPtrs[raidID]->valid == 0)
				continue;

			if (raidPtrs[raidID]->root_partition == 0)
				continue;

			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
				devname = raidPtrs[raidID]->Disks[col].devname;
				/* skip the "/dev/" prefix before comparing */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
					    strlen(device_xname(booted_device))) != 0)
					continue;
				aprint_debug("raid%d includes boot device %s\n",
					     raidID, devname);
				num_root++;
				rootID = raidID;
			}
		}

		if (num_root == 1) {
			booted_device = raid_softc[rootID].sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
522
523
524 int
525 raidsize(dev_t dev)
526 {
527 struct raid_softc *rs;
528 struct disklabel *lp;
529 int part, unit, omask, size;
530
531 unit = raidunit(dev);
532 if (unit >= numraid)
533 return (-1);
534 rs = &raid_softc[unit];
535
536 if ((rs->sc_flags & RAIDF_INITED) == 0)
537 return (-1);
538
539 part = DISKPART(dev);
540 omask = rs->sc_dkdev.dk_openmask & (1 << part);
541 lp = rs->sc_dkdev.dk_label;
542
543 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
544 return (-1);
545
546 if (lp->d_partitions[part].p_fstype != FS_SWAP)
547 size = -1;
548 else
549 size = lp->d_partitions[part].p_size *
550 (lp->d_secsize / DEV_BSIZE);
551
552 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
553 return (-1);
554
555 return (size);
556
557 }
558
/*
 * raiddump:
 *	Crash-dump entry point.  Only RAID 1 sets (one data column, one
 *	parity column) are supported; the dump is written to a single live
 *	component chosen by the preference order documented below.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int part, c, sparecol, j, scol, dumpto;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);

	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* dump size must be a whole number of device blocks */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* refuse to write past the end of the raid device */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* find which column this spare is standing in for */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
696 /* ARGSUSED */
/* ARGSUSED */
/*
 * raidopen:
 *	Block/character device open.  Validates the unit and partition,
 *	reads the disklabel on first open, records the open in the
 *	per-format open masks, and marks components dirty on the first
 *	open of an initialized set.  Note that the success path also
 *	falls through the "bad:" label (with error == 0) so the unit
 *	lock is always released.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, pmask;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		/* unit is on its way out; refuse new opens */
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* (re)read the label on the first open of an initialized unit */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(raidPtrs[unit]);
	}


	/* combined mask covers both block and character opens */
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
782 /* ARGSUSED */
783 int
784 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
785 {
786 int unit = raidunit(dev);
787 struct raid_softc *rs;
788 int error = 0;
789 int part;
790
791 if (unit >= numraid)
792 return (ENXIO);
793 rs = &raid_softc[unit];
794
795 if ((error = raidlock(rs)) != 0)
796 return (error);
797
798 part = DISKPART(dev);
799
800 /* ...that much closer to allowing unconfiguration... */
801 switch (fmt) {
802 case S_IFCHR:
803 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
804 break;
805
806 case S_IFBLK:
807 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
808 break;
809 }
810 rs->sc_dkdev.dk_openmask =
811 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
812
813 if ((rs->sc_dkdev.dk_openmask == 0) &&
814 ((rs->sc_flags & RAIDF_INITED) != 0)) {
815 /* Last one... device is not unconfigured yet.
816 Device shutdown has taken care of setting the
817 clean bits if RAIDF_INITED is not set
818 mark things as clean... */
819
820 rf_update_component_labels(raidPtrs[unit],
821 RF_FINAL_COMPONENT_UPDATE);
822
823 /* If the kernel is shutting down, it will detach
824 * this RAID set soon enough.
825 */
826 }
827
828 raidunlock(rs);
829 return (0);
830
831 }
832
833 void
834 raidstrategy(struct buf *bp)
835 {
836 unsigned int raidID = raidunit(bp->b_dev);
837 RF_Raid_t *raidPtr;
838 struct raid_softc *rs = &raid_softc[raidID];
839 int wlabel;
840
841 if ((rs->sc_flags & RAIDF_INITED) ==0) {
842 bp->b_error = ENXIO;
843 goto done;
844 }
845 if (raidID >= numraid || !raidPtrs[raidID]) {
846 bp->b_error = ENODEV;
847 goto done;
848 }
849 raidPtr = raidPtrs[raidID];
850 if (!raidPtr->valid) {
851 bp->b_error = ENODEV;
852 goto done;
853 }
854 if (bp->b_bcount == 0) {
855 db1_printf(("b_bcount is zero..\n"));
856 goto done;
857 }
858
859 /*
860 * Do bounds checking and adjust transfer. If there's an
861 * error, the bounds check will flag that for us.
862 */
863
864 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
865 if (DISKPART(bp->b_dev) == RAW_PART) {
866 uint64_t size; /* device size in DEV_BSIZE unit */
867
868 if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
869 size = raidPtr->totalSectors <<
870 (raidPtr->logBytesPerSector - DEV_BSHIFT);
871 } else {
872 size = raidPtr->totalSectors >>
873 (DEV_BSHIFT - raidPtr->logBytesPerSector);
874 }
875 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
876 goto done;
877 }
878 } else {
879 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
880 db1_printf(("Bounds check failed!!:%d %d\n",
881 (int) bp->b_blkno, (int) wlabel));
882 goto done;
883 }
884 }
885
886 rf_lock_mutex2(raidPtr->iodone_lock);
887
888 bp->b_resid = 0;
889
890 /* stuff it onto our queue */
891 bufq_put(rs->buf_queue, bp);
892
893 /* scheduled the IO to happen at the next convenient time */
894 rf_signal_cond2(raidPtr->iodone_cv);
895 rf_unlock_mutex2(raidPtr->iodone_lock);
896
897 return;
898
899 done:
900 bp->b_resid = bp->b_bcount;
901 biodone(bp);
902 }
903 /* ARGSUSED */
904 int
905 raidread(dev_t dev, struct uio *uio, int flags)
906 {
907 int unit = raidunit(dev);
908 struct raid_softc *rs;
909
910 if (unit >= numraid)
911 return (ENXIO);
912 rs = &raid_softc[unit];
913
914 if ((rs->sc_flags & RAIDF_INITED) == 0)
915 return (ENXIO);
916
917 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
918
919 }
920 /* ARGSUSED */
921 int
922 raidwrite(dev_t dev, struct uio *uio, int flags)
923 {
924 int unit = raidunit(dev);
925 struct raid_softc *rs;
926
927 if (unit >= numraid)
928 return (ENXIO);
929 rs = &raid_softc[unit];
930
931 if ((rs->sc_flags & RAIDF_INITED) == 0)
932 return (ENXIO);
933
934 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
935
936 }
937
/*
 * raid_detach_unlocked:
 *	Tear down a raid unit; caller must hold the unit lock.  Refuses
 *	(EBUSY) if any partition is still open.  Shuts down the RAIDframe
 *	core when the unit is initialized, then detaches the disk(9)
 *	structures.  Returns 0 on success or an errno.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	int error;
	RF_Raid_t *raidPtr;

	raidPtr = raidPtrs[device_unit(rs->sc_dev)];

	/*
	 * If somebody has a partition mounted, we shouldn't
	 * shutdown.
	 */
	if (rs->sc_dkdev.dk_openmask != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		;	/* not initialized: nothing to do */
	else if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;
	else
		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);

	/* Detach the disk. */
	dkwedge_delall(&rs->sc_dkdev);
	disk_detach(&rs->sc_dkdev);
	disk_destroy(&rs->sc_dkdev);

	aprint_normal_dev(rs->sc_dev, "detached\n");

	return 0;
}
969
970 int
971 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
972 {
973 int unit = raidunit(dev);
974 int error = 0;
975 int part, pmask;
976 cfdata_t cf;
977 struct raid_softc *rs;
978 RF_Config_t *k_cfg, *u_cfg;
979 RF_Raid_t *raidPtr;
980 RF_RaidDisk_t *diskPtr;
981 RF_AccTotals_t *totals;
982 RF_DeviceConfig_t *d_cfg, **ucfgp;
983 u_char *specific_buf;
984 int retcode = 0;
985 int column;
986 /* int raidid; */
987 struct rf_recon_req *rrcopy, *rr;
988 RF_ComponentLabel_t *clabel;
989 RF_ComponentLabel_t *ci_label;
990 RF_ComponentLabel_t **clabel_ptr;
991 RF_SingleComponent_t *sparePtr,*componentPtr;
992 RF_SingleComponent_t component;
993 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
994 int i, j, d;
995 #ifdef __HAVE_OLD_DISKLABEL
996 struct disklabel newlabel;
997 #endif
998 struct dkwedge_info *dkw;
999
1000 if (unit >= numraid)
1001 return (ENXIO);
1002 rs = &raid_softc[unit];
1003 raidPtr = raidPtrs[unit];
1004
1005 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1006 (int) DISKPART(dev), (int) unit, cmd));
1007
1008 /* Must be open for writes for these commands... */
1009 switch (cmd) {
1010 #ifdef DIOCGSECTORSIZE
1011 case DIOCGSECTORSIZE:
1012 *(u_int *)data = raidPtr->bytesPerSector;
1013 return 0;
1014 case DIOCGMEDIASIZE:
1015 *(off_t *)data =
1016 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1017 return 0;
1018 #endif
1019 case DIOCSDINFO:
1020 case DIOCWDINFO:
1021 #ifdef __HAVE_OLD_DISKLABEL
1022 case ODIOCWDINFO:
1023 case ODIOCSDINFO:
1024 #endif
1025 case DIOCWLABEL:
1026 case DIOCAWEDGE:
1027 case DIOCDWEDGE:
1028 if ((flag & FWRITE) == 0)
1029 return (EBADF);
1030 }
1031
1032 /* Must be initialized for these... */
1033 switch (cmd) {
1034 case DIOCGDINFO:
1035 case DIOCSDINFO:
1036 case DIOCWDINFO:
1037 #ifdef __HAVE_OLD_DISKLABEL
1038 case ODIOCGDINFO:
1039 case ODIOCWDINFO:
1040 case ODIOCSDINFO:
1041 case ODIOCGDEFLABEL:
1042 #endif
1043 case DIOCGPART:
1044 case DIOCWLABEL:
1045 case DIOCGDEFLABEL:
1046 case DIOCAWEDGE:
1047 case DIOCDWEDGE:
1048 case DIOCLWEDGES:
1049 case DIOCCACHESYNC:
1050 case RAIDFRAME_SHUTDOWN:
1051 case RAIDFRAME_REWRITEPARITY:
1052 case RAIDFRAME_GET_INFO:
1053 case RAIDFRAME_RESET_ACCTOTALS:
1054 case RAIDFRAME_GET_ACCTOTALS:
1055 case RAIDFRAME_KEEP_ACCTOTALS:
1056 case RAIDFRAME_GET_SIZE:
1057 case RAIDFRAME_FAIL_DISK:
1058 case RAIDFRAME_COPYBACK:
1059 case RAIDFRAME_CHECK_RECON_STATUS:
1060 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1061 case RAIDFRAME_GET_COMPONENT_LABEL:
1062 case RAIDFRAME_SET_COMPONENT_LABEL:
1063 case RAIDFRAME_ADD_HOT_SPARE:
1064 case RAIDFRAME_REMOVE_HOT_SPARE:
1065 case RAIDFRAME_INIT_LABELS:
1066 case RAIDFRAME_REBUILD_IN_PLACE:
1067 case RAIDFRAME_CHECK_PARITY:
1068 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1069 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1070 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1071 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1072 case RAIDFRAME_SET_AUTOCONFIG:
1073 case RAIDFRAME_SET_ROOT:
1074 case RAIDFRAME_DELETE_COMPONENT:
1075 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1076 case RAIDFRAME_PARITYMAP_STATUS:
1077 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1078 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1079 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1080 if ((rs->sc_flags & RAIDF_INITED) == 0)
1081 return (ENXIO);
1082 }
1083
1084 switch (cmd) {
1085 #ifdef COMPAT_50
1086 case RAIDFRAME_GET_INFO50:
1087 return rf_get_info50(raidPtr, data);
1088
1089 case RAIDFRAME_CONFIGURE50:
1090 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1091 return retcode;
1092 goto config;
1093 #endif
1094 /* configure the system */
1095 case RAIDFRAME_CONFIGURE:
1096
1097 if (raidPtr->valid) {
1098 /* There is a valid RAID set running on this unit! */
1099 printf("raid%d: Device already configured!\n",unit);
1100 return(EINVAL);
1101 }
1102
1103 /* copy-in the configuration information */
1104 /* data points to a pointer to the configuration structure */
1105
1106 u_cfg = *((RF_Config_t **) data);
1107 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1108 if (k_cfg == NULL) {
1109 return (ENOMEM);
1110 }
1111 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1112 if (retcode) {
1113 RF_Free(k_cfg, sizeof(RF_Config_t));
1114 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1115 retcode));
1116 return (retcode);
1117 }
1118 goto config;
1119 config:
1120 /* allocate a buffer for the layout-specific data, and copy it
1121 * in */
1122 if (k_cfg->layoutSpecificSize) {
1123 if (k_cfg->layoutSpecificSize > 10000) {
1124 /* sanity check */
1125 RF_Free(k_cfg, sizeof(RF_Config_t));
1126 return (EINVAL);
1127 }
1128 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1129 (u_char *));
1130 if (specific_buf == NULL) {
1131 RF_Free(k_cfg, sizeof(RF_Config_t));
1132 return (ENOMEM);
1133 }
1134 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1135 k_cfg->layoutSpecificSize);
1136 if (retcode) {
1137 RF_Free(k_cfg, sizeof(RF_Config_t));
1138 RF_Free(specific_buf,
1139 k_cfg->layoutSpecificSize);
1140 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1141 retcode));
1142 return (retcode);
1143 }
1144 } else
1145 specific_buf = NULL;
1146 k_cfg->layoutSpecific = specific_buf;
1147
1148 /* should do some kind of sanity check on the configuration.
1149 * Store the sum of all the bytes in the last byte? */
1150
1151 /* configure the system */
1152
1153 /*
1154 * Clear the entire RAID descriptor, just to make sure
1155 * there is no stale data left in the case of a
1156 * reconfiguration
1157 */
1158 memset(raidPtr, 0, sizeof(*raidPtr));
1159 raidPtr->raidid = unit;
1160
1161 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1162
1163 if (retcode == 0) {
1164
1165 /* allow this many simultaneous IO's to
1166 this RAID device */
1167 raidPtr->openings = RAIDOUTSTANDING;
1168
1169 raidinit(raidPtr);
1170 rf_markalldirty(raidPtr);
1171 }
1172 /* free the buffers. No return code here. */
1173 if (k_cfg->layoutSpecificSize) {
1174 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1175 }
1176 RF_Free(k_cfg, sizeof(RF_Config_t));
1177
1178 return (retcode);
1179
1180 /* shutdown the system */
1181 case RAIDFRAME_SHUTDOWN:
1182
1183 part = DISKPART(dev);
1184 pmask = (1 << part);
1185
1186 if ((error = raidlock(rs)) != 0)
1187 return (error);
1188
1189 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1190 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1191 (rs->sc_dkdev.dk_copenmask & pmask)))
1192 retcode = EBUSY;
1193 else {
1194 rs->sc_flags |= RAIDF_SHUTDOWN;
1195 rs->sc_dkdev.dk_copenmask &= ~pmask;
1196 rs->sc_dkdev.dk_bopenmask &= ~pmask;
1197 rs->sc_dkdev.dk_openmask &= ~pmask;
1198 retcode = 0;
1199 }
1200
1201 raidunlock(rs);
1202
1203 if (retcode != 0)
1204 return retcode;
1205
1206 /* free the pseudo device attach bits */
1207
1208 cf = device_cfdata(rs->sc_dev);
1209 if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
1210 free(cf, M_RAIDFRAME);
1211
1212 return (retcode);
1213 case RAIDFRAME_GET_COMPONENT_LABEL:
1214 clabel_ptr = (RF_ComponentLabel_t **) data;
1215 /* need to read the component label for the disk indicated
1216 by row,column in clabel */
1217
1218 /*
1219 * Perhaps there should be an option to skip the in-core
1220 * copy and hit the disk, as with disklabel(8).
1221 */
1222 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1223
1224 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1225
1226 if (retcode) {
1227 RF_Free(clabel, sizeof(*clabel));
1228 return retcode;
1229 }
1230
1231 clabel->row = 0; /* Don't allow looking at anything else.*/
1232
1233 column = clabel->column;
1234
1235 if ((column < 0) || (column >= raidPtr->numCol +
1236 raidPtr->numSpare)) {
1237 RF_Free(clabel, sizeof(*clabel));
1238 return EINVAL;
1239 }
1240
1241 RF_Free(clabel, sizeof(*clabel));
1242
1243 clabel = raidget_component_label(raidPtr, column);
1244
1245 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1246
1247 #if 0
1248 case RAIDFRAME_SET_COMPONENT_LABEL:
1249 clabel = (RF_ComponentLabel_t *) data;
1250
1251 /* XXX check the label for valid stuff... */
1252 /* Note that some things *should not* get modified --
1253 the user should be re-initing the labels instead of
1254 trying to patch things.
1255 */
1256
1257 raidid = raidPtr->raidid;
1258 #ifdef DEBUG
1259 printf("raid%d: Got component label:\n", raidid);
1260 printf("raid%d: Version: %d\n", raidid, clabel->version);
1261 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1262 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1263 printf("raid%d: Column: %d\n", raidid, clabel->column);
1264 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1265 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1266 printf("raid%d: Status: %d\n", raidid, clabel->status);
1267 #endif
1268 clabel->row = 0;
1269 column = clabel->column;
1270
1271 if ((column < 0) || (column >= raidPtr->numCol)) {
1272 return(EINVAL);
1273 }
1274
1275 /* XXX this isn't allowed to do anything for now :-) */
1276
1277 /* XXX and before it is, we need to fill in the rest
1278 of the fields!?!?!?! */
1279 memcpy(raidget_component_label(raidPtr, column),
1280 clabel, sizeof(*clabel));
1281 raidflush_component_label(raidPtr, column);
1282 return (0);
1283 #endif
1284
1285 case RAIDFRAME_INIT_LABELS:
1286 clabel = (RF_ComponentLabel_t *) data;
1287 /*
1288 we only want the serial number from
1289 the above. We get all the rest of the information
1290 from the config that was used to create this RAID
1291 set.
1292 */
1293
1294 raidPtr->serial_number = clabel->serial_number;
1295
1296 for(column=0;column<raidPtr->numCol;column++) {
1297 diskPtr = &raidPtr->Disks[column];
1298 if (!RF_DEAD_DISK(diskPtr->status)) {
1299 ci_label = raidget_component_label(raidPtr,
1300 column);
1301 /* Zeroing this is important. */
1302 memset(ci_label, 0, sizeof(*ci_label));
1303 raid_init_component_label(raidPtr, ci_label);
1304 ci_label->serial_number =
1305 raidPtr->serial_number;
1306 ci_label->row = 0; /* we dont' pretend to support more */
1307 rf_component_label_set_partitionsize(ci_label,
1308 diskPtr->partitionSize);
1309 ci_label->column = column;
1310 raidflush_component_label(raidPtr, column);
1311 }
1312 /* XXXjld what about the spares? */
1313 }
1314
1315 return (retcode);
1316 case RAIDFRAME_SET_AUTOCONFIG:
1317 d = rf_set_autoconfig(raidPtr, *(int *) data);
1318 printf("raid%d: New autoconfig value is: %d\n",
1319 raidPtr->raidid, d);
1320 *(int *) data = d;
1321 return (retcode);
1322
1323 case RAIDFRAME_SET_ROOT:
1324 d = rf_set_rootpartition(raidPtr, *(int *) data);
1325 printf("raid%d: New rootpartition value is: %d\n",
1326 raidPtr->raidid, d);
1327 *(int *) data = d;
1328 return (retcode);
1329
1330 /* initialize all parity */
1331 case RAIDFRAME_REWRITEPARITY:
1332
1333 if (raidPtr->Layout.map->faultsTolerated == 0) {
1334 /* Parity for RAID 0 is trivially correct */
1335 raidPtr->parity_good = RF_RAID_CLEAN;
1336 return(0);
1337 }
1338
1339 if (raidPtr->parity_rewrite_in_progress == 1) {
1340 /* Re-write is already in progress! */
1341 return(EINVAL);
1342 }
1343
1344 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1345 rf_RewriteParityThread,
1346 raidPtr,"raid_parity");
1347 return (retcode);
1348
1349
1350 case RAIDFRAME_ADD_HOT_SPARE:
1351 sparePtr = (RF_SingleComponent_t *) data;
1352 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1353 retcode = rf_add_hot_spare(raidPtr, &component);
1354 return(retcode);
1355
1356 case RAIDFRAME_REMOVE_HOT_SPARE:
1357 return(retcode);
1358
1359 case RAIDFRAME_DELETE_COMPONENT:
1360 componentPtr = (RF_SingleComponent_t *)data;
1361 memcpy( &component, componentPtr,
1362 sizeof(RF_SingleComponent_t));
1363 retcode = rf_delete_component(raidPtr, &component);
1364 return(retcode);
1365
1366 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1367 componentPtr = (RF_SingleComponent_t *)data;
1368 memcpy( &component, componentPtr,
1369 sizeof(RF_SingleComponent_t));
1370 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1371 return(retcode);
1372
1373 case RAIDFRAME_REBUILD_IN_PLACE:
1374
1375 if (raidPtr->Layout.map->faultsTolerated == 0) {
1376 /* Can't do this on a RAID 0!! */
1377 return(EINVAL);
1378 }
1379
1380 if (raidPtr->recon_in_progress == 1) {
1381 /* a reconstruct is already in progress! */
1382 return(EINVAL);
1383 }
1384
1385 componentPtr = (RF_SingleComponent_t *) data;
1386 memcpy( &component, componentPtr,
1387 sizeof(RF_SingleComponent_t));
1388 component.row = 0; /* we don't support any more */
1389 column = component.column;
1390
1391 if ((column < 0) || (column >= raidPtr->numCol)) {
1392 return(EINVAL);
1393 }
1394
1395 RF_LOCK_MUTEX(raidPtr->mutex);
1396 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1397 (raidPtr->numFailures > 0)) {
1398 /* XXX 0 above shouldn't be constant!!! */
1399 /* some component other than this has failed.
1400 Let's not make things worse than they already
1401 are... */
1402 printf("raid%d: Unable to reconstruct to disk at:\n",
1403 raidPtr->raidid);
1404 printf("raid%d: Col: %d Too many failures.\n",
1405 raidPtr->raidid, column);
1406 RF_UNLOCK_MUTEX(raidPtr->mutex);
1407 return (EINVAL);
1408 }
1409 if (raidPtr->Disks[column].status ==
1410 rf_ds_reconstructing) {
1411 printf("raid%d: Unable to reconstruct to disk at:\n",
1412 raidPtr->raidid);
1413 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column);
1414
1415 RF_UNLOCK_MUTEX(raidPtr->mutex);
1416 return (EINVAL);
1417 }
1418 if (raidPtr->Disks[column].status == rf_ds_spared) {
1419 RF_UNLOCK_MUTEX(raidPtr->mutex);
1420 return (EINVAL);
1421 }
1422 RF_UNLOCK_MUTEX(raidPtr->mutex);
1423
1424 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1425 if (rrcopy == NULL)
1426 return(ENOMEM);
1427
1428 rrcopy->raidPtr = (void *) raidPtr;
1429 rrcopy->col = column;
1430
1431 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1432 rf_ReconstructInPlaceThread,
1433 rrcopy,"raid_reconip");
1434 return(retcode);
1435
1436 case RAIDFRAME_GET_INFO:
1437 if (!raidPtr->valid)
1438 return (ENODEV);
1439 ucfgp = (RF_DeviceConfig_t **) data;
1440 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1441 (RF_DeviceConfig_t *));
1442 if (d_cfg == NULL)
1443 return (ENOMEM);
1444 d_cfg->rows = 1; /* there is only 1 row now */
1445 d_cfg->cols = raidPtr->numCol;
1446 d_cfg->ndevs = raidPtr->numCol;
1447 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1448 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1449 return (ENOMEM);
1450 }
1451 d_cfg->nspares = raidPtr->numSpare;
1452 if (d_cfg->nspares >= RF_MAX_DISKS) {
1453 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1454 return (ENOMEM);
1455 }
1456 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1457 d = 0;
1458 for (j = 0; j < d_cfg->cols; j++) {
1459 d_cfg->devs[d] = raidPtr->Disks[j];
1460 d++;
1461 }
1462 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1463 d_cfg->spares[i] = raidPtr->Disks[j];
1464 }
1465 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1466 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1467
1468 return (retcode);
1469
1470 case RAIDFRAME_CHECK_PARITY:
1471 *(int *) data = raidPtr->parity_good;
1472 return (0);
1473
1474 case RAIDFRAME_PARITYMAP_STATUS:
1475 if (rf_paritymap_ineligible(raidPtr))
1476 return EINVAL;
1477 rf_paritymap_status(raidPtr->parity_map,
1478 (struct rf_pmstat *)data);
1479 return 0;
1480
1481 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1482 if (rf_paritymap_ineligible(raidPtr))
1483 return EINVAL;
1484 if (raidPtr->parity_map == NULL)
1485 return ENOENT; /* ??? */
1486 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1487 (struct rf_pmparams *)data, 1))
1488 return EINVAL;
1489 return 0;
1490
1491 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1492 if (rf_paritymap_ineligible(raidPtr))
1493 return EINVAL;
1494 *(int *) data = rf_paritymap_get_disable(raidPtr);
1495 return 0;
1496
1497 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1498 if (rf_paritymap_ineligible(raidPtr))
1499 return EINVAL;
1500 rf_paritymap_set_disable(raidPtr, *(int *)data);
1501 /* XXX should errors be passed up? */
1502 return 0;
1503
1504 case RAIDFRAME_RESET_ACCTOTALS:
1505 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1506 return (0);
1507
1508 case RAIDFRAME_GET_ACCTOTALS:
1509 totals = (RF_AccTotals_t *) data;
1510 *totals = raidPtr->acc_totals;
1511 return (0);
1512
1513 case RAIDFRAME_KEEP_ACCTOTALS:
1514 raidPtr->keep_acc_totals = *(int *)data;
1515 return (0);
1516
1517 case RAIDFRAME_GET_SIZE:
1518 *(int *) data = raidPtr->totalSectors;
1519 return (0);
1520
1521 /* fail a disk & optionally start reconstruction */
1522 case RAIDFRAME_FAIL_DISK:
1523
1524 if (raidPtr->Layout.map->faultsTolerated == 0) {
1525 /* Can't do this on a RAID 0!! */
1526 return(EINVAL);
1527 }
1528
1529 rr = (struct rf_recon_req *) data;
1530 rr->row = 0;
1531 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1532 return (EINVAL);
1533
1534
1535 RF_LOCK_MUTEX(raidPtr->mutex);
1536 if (raidPtr->status == rf_rs_reconstructing) {
1537 /* you can't fail a disk while we're reconstructing! */
1538 /* XXX wrong for RAID6 */
1539 RF_UNLOCK_MUTEX(raidPtr->mutex);
1540 return (EINVAL);
1541 }
1542 if ((raidPtr->Disks[rr->col].status ==
1543 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1544 /* some other component has failed. Let's not make
1545 things worse. XXX wrong for RAID6 */
1546 RF_UNLOCK_MUTEX(raidPtr->mutex);
1547 return (EINVAL);
1548 }
1549 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1550 /* Can't fail a spared disk! */
1551 RF_UNLOCK_MUTEX(raidPtr->mutex);
1552 return (EINVAL);
1553 }
1554 RF_UNLOCK_MUTEX(raidPtr->mutex);
1555
1556 /* make a copy of the recon request so that we don't rely on
1557 * the user's buffer */
1558 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1559 if (rrcopy == NULL)
1560 return(ENOMEM);
1561 memcpy(rrcopy, rr, sizeof(*rr));
1562 rrcopy->raidPtr = (void *) raidPtr;
1563
1564 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1565 rf_ReconThread,
1566 rrcopy,"raid_recon");
1567 return (0);
1568
1569 /* invoke a copyback operation after recon on whatever disk
1570 * needs it, if any */
1571 case RAIDFRAME_COPYBACK:
1572
1573 if (raidPtr->Layout.map->faultsTolerated == 0) {
1574 /* This makes no sense on a RAID 0!! */
1575 return(EINVAL);
1576 }
1577
1578 if (raidPtr->copyback_in_progress == 1) {
1579 /* Copyback is already in progress! */
1580 return(EINVAL);
1581 }
1582
1583 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1584 rf_CopybackThread,
1585 raidPtr,"raid_copyback");
1586 return (retcode);
1587
1588 /* return the percentage completion of reconstruction */
1589 case RAIDFRAME_CHECK_RECON_STATUS:
1590 if (raidPtr->Layout.map->faultsTolerated == 0) {
1591 /* This makes no sense on a RAID 0, so tell the
1592 user it's done. */
1593 *(int *) data = 100;
1594 return(0);
1595 }
1596 if (raidPtr->status != rf_rs_reconstructing)
1597 *(int *) data = 100;
1598 else {
1599 if (raidPtr->reconControl->numRUsTotal > 0) {
1600 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1601 } else {
1602 *(int *) data = 0;
1603 }
1604 }
1605 return (0);
1606 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1607 progressInfoPtr = (RF_ProgressInfo_t **) data;
1608 if (raidPtr->status != rf_rs_reconstructing) {
1609 progressInfo.remaining = 0;
1610 progressInfo.completed = 100;
1611 progressInfo.total = 100;
1612 } else {
1613 progressInfo.total =
1614 raidPtr->reconControl->numRUsTotal;
1615 progressInfo.completed =
1616 raidPtr->reconControl->numRUsComplete;
1617 progressInfo.remaining = progressInfo.total -
1618 progressInfo.completed;
1619 }
1620 retcode = copyout(&progressInfo, *progressInfoPtr,
1621 sizeof(RF_ProgressInfo_t));
1622 return (retcode);
1623
1624 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1625 if (raidPtr->Layout.map->faultsTolerated == 0) {
1626 /* This makes no sense on a RAID 0, so tell the
1627 user it's done. */
1628 *(int *) data = 100;
1629 return(0);
1630 }
1631 if (raidPtr->parity_rewrite_in_progress == 1) {
1632 *(int *) data = 100 *
1633 raidPtr->parity_rewrite_stripes_done /
1634 raidPtr->Layout.numStripe;
1635 } else {
1636 *(int *) data = 100;
1637 }
1638 return (0);
1639
1640 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1641 progressInfoPtr = (RF_ProgressInfo_t **) data;
1642 if (raidPtr->parity_rewrite_in_progress == 1) {
1643 progressInfo.total = raidPtr->Layout.numStripe;
1644 progressInfo.completed =
1645 raidPtr->parity_rewrite_stripes_done;
1646 progressInfo.remaining = progressInfo.total -
1647 progressInfo.completed;
1648 } else {
1649 progressInfo.remaining = 0;
1650 progressInfo.completed = 100;
1651 progressInfo.total = 100;
1652 }
1653 retcode = copyout(&progressInfo, *progressInfoPtr,
1654 sizeof(RF_ProgressInfo_t));
1655 return (retcode);
1656
1657 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1658 if (raidPtr->Layout.map->faultsTolerated == 0) {
1659 /* This makes no sense on a RAID 0 */
1660 *(int *) data = 100;
1661 return(0);
1662 }
1663 if (raidPtr->copyback_in_progress == 1) {
1664 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1665 raidPtr->Layout.numStripe;
1666 } else {
1667 *(int *) data = 100;
1668 }
1669 return (0);
1670
1671 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1672 progressInfoPtr = (RF_ProgressInfo_t **) data;
1673 if (raidPtr->copyback_in_progress == 1) {
1674 progressInfo.total = raidPtr->Layout.numStripe;
1675 progressInfo.completed =
1676 raidPtr->copyback_stripes_done;
1677 progressInfo.remaining = progressInfo.total -
1678 progressInfo.completed;
1679 } else {
1680 progressInfo.remaining = 0;
1681 progressInfo.completed = 100;
1682 progressInfo.total = 100;
1683 }
1684 retcode = copyout(&progressInfo, *progressInfoPtr,
1685 sizeof(RF_ProgressInfo_t));
1686 return (retcode);
1687
1688 /* the sparetable daemon calls this to wait for the kernel to
1689 * need a spare table. this ioctl does not return until a
1690 * spare table is needed. XXX -- calling mpsleep here in the
1691 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1692 * -- I should either compute the spare table in the kernel,
1693 * or have a different -- XXX XXX -- interface (a different
1694 * character device) for delivering the table -- XXX */
1695 #if 0
1696 case RAIDFRAME_SPARET_WAIT:
1697 rf_lock_mutex2(rf_sparet_wait_mutex);
1698 while (!rf_sparet_wait_queue)
1699 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1700 waitreq = rf_sparet_wait_queue;
1701 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1702 rf_unlock_mutex2(rf_sparet_wait_mutex);
1703
1704 /* structure assignment */
1705 *((RF_SparetWait_t *) data) = *waitreq;
1706
1707 RF_Free(waitreq, sizeof(*waitreq));
1708 return (0);
1709
1710 /* wakes up a process waiting on SPARET_WAIT and puts an error
	 * code in it that will cause the daemon to exit */
1712 case RAIDFRAME_ABORT_SPARET_WAIT:
1713 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1714 waitreq->fcol = -1;
1715 rf_lock_mutex2(rf_sparet_wait_mutex);
1716 waitreq->next = rf_sparet_wait_queue;
1717 rf_sparet_wait_queue = waitreq;
1718 rf_broadcast_conf2(rf_sparet_wait_cv);
1719 rf_unlock_mutex2(rf_sparet_wait_mutex);
1720 return (0);
1721
1722 /* used by the spare table daemon to deliver a spare table
1723 * into the kernel */
1724 case RAIDFRAME_SEND_SPARET:
1725
1726 /* install the spare table */
1727 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1728
1729 /* respond to the requestor. the return status of the spare
1730 * table installation is passed in the "fcol" field */
1731 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1732 waitreq->fcol = retcode;
1733 rf_lock_mutex2(rf_sparet_wait_mutex);
1734 waitreq->next = rf_sparet_resp_queue;
1735 rf_sparet_resp_queue = waitreq;
1736 rf_broadcast_cond2(rf_sparet_resp_cv);
1737 rf_unlock_mutex2(rf_sparet_wait_mutex);
1738
1739 return (retcode);
1740 #endif
1741
1742 default:
1743 break; /* fall through to the os-specific code below */
1744
1745 }
1746
1747 if (!raidPtr->valid)
1748 return (EINVAL);
1749
1750 /*
1751 * Add support for "regular" device ioctls here.
1752 */
1753
1754 error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
1755 if (error != EPASSTHROUGH)
1756 return (error);
1757
1758 switch (cmd) {
1759 case DIOCGDINFO:
1760 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1761 break;
1762 #ifdef __HAVE_OLD_DISKLABEL
1763 case ODIOCGDINFO:
1764 newlabel = *(rs->sc_dkdev.dk_label);
1765 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1766 return ENOTTY;
1767 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1768 break;
1769 #endif
1770
1771 case DIOCGPART:
1772 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1773 ((struct partinfo *) data)->part =
1774 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1775 break;
1776
1777 case DIOCWDINFO:
1778 case DIOCSDINFO:
1779 #ifdef __HAVE_OLD_DISKLABEL
1780 case ODIOCWDINFO:
1781 case ODIOCSDINFO:
1782 #endif
1783 {
1784 struct disklabel *lp;
1785 #ifdef __HAVE_OLD_DISKLABEL
1786 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1787 memset(&newlabel, 0, sizeof newlabel);
1788 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1789 lp = &newlabel;
1790 } else
1791 #endif
1792 lp = (struct disklabel *)data;
1793
1794 if ((error = raidlock(rs)) != 0)
1795 return (error);
1796
1797 rs->sc_flags |= RAIDF_LABELLING;
1798
1799 error = setdisklabel(rs->sc_dkdev.dk_label,
1800 lp, 0, rs->sc_dkdev.dk_cpulabel);
1801 if (error == 0) {
1802 if (cmd == DIOCWDINFO
1803 #ifdef __HAVE_OLD_DISKLABEL
1804 || cmd == ODIOCWDINFO
1805 #endif
1806 )
1807 error = writedisklabel(RAIDLABELDEV(dev),
1808 raidstrategy, rs->sc_dkdev.dk_label,
1809 rs->sc_dkdev.dk_cpulabel);
1810 }
1811 rs->sc_flags &= ~RAIDF_LABELLING;
1812
1813 raidunlock(rs);
1814
1815 if (error)
1816 return (error);
1817 break;
1818 }
1819
1820 case DIOCWLABEL:
1821 if (*(int *) data != 0)
1822 rs->sc_flags |= RAIDF_WLABEL;
1823 else
1824 rs->sc_flags &= ~RAIDF_WLABEL;
1825 break;
1826
1827 case DIOCGDEFLABEL:
1828 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1829 break;
1830
1831 #ifdef __HAVE_OLD_DISKLABEL
1832 case ODIOCGDEFLABEL:
1833 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1834 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1835 return ENOTTY;
1836 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1837 break;
1838 #endif
1839
1840 case DIOCAWEDGE:
1841 case DIOCDWEDGE:
1842 dkw = (void *)data;
1843
1844 /* If the ioctl happens here, the parent is us. */
1845 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1846 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1847
1848 case DIOCLWEDGES:
1849 return dkwedge_list(&rs->sc_dkdev,
1850 (struct dkwedge_list *)data, l);
1851 case DIOCCACHESYNC:
1852 return rf_sync_component_caches(raidPtr);
1853 default:
1854 retcode = ENOTTY;
1855 }
1856 return (retcode);
1857
1858 }
1859
1860
1861 /* raidinit -- complete the rest of the initialization for the
1862 RAIDframe device. */
1863
1864
1865 static void
1866 raidinit(RF_Raid_t *raidPtr)
1867 {
1868 cfdata_t cf;
1869 struct raid_softc *rs;
1870 int unit;
1871
1872 unit = raidPtr->raidid;
1873
1874 rs = &raid_softc[unit];
1875
1876 /* XXX should check return code first... */
1877 rs->sc_flags |= RAIDF_INITED;
1878
1879 /* XXX doesn't check bounds. */
1880 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1881
1882 /* attach the pseudo device */
1883 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1884 cf->cf_name = raid_cd.cd_name;
1885 cf->cf_atname = raid_cd.cd_name;
1886 cf->cf_unit = unit;
1887 cf->cf_fstate = FSTATE_STAR;
1888
1889 rs->sc_dev = config_attach_pseudo(cf);
1890
1891 if (rs->sc_dev == NULL) {
1892 printf("raid%d: config_attach_pseudo failed\n",
1893 raidPtr->raidid);
1894 rs->sc_flags &= ~RAIDF_INITED;
1895 free(cf, M_RAIDFRAME);
1896 return;
1897 }
1898
1899 /* disk_attach actually creates space for the CPU disklabel, among
1900 * other things, so it's critical to call this *BEFORE* we try putzing
1901 * with disklabels. */
1902
1903 disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1904 disk_attach(&rs->sc_dkdev);
1905 disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
1906
1907 /* XXX There may be a weird interaction here between this, and
1908 * protectedSectors, as used in RAIDframe. */
1909
1910 rs->sc_size = raidPtr->totalSectors;
1911
1912 dkwedge_discover(&rs->sc_dkdev);
1913
1914 rf_set_properties(rs, raidPtr);
1915
1916 }
1917 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1918 /* wake up the daemon & tell it to get us a spare table
1919 * XXX
1920 * the entries in the queues should be tagged with the raidPtr
1921 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
1923 * XXX
1924 *
1925 * XXX This code is not currently used. GO
1926 */
/*
 * Hand a spare-table request to the user-level daemon and block until
 * the daemon delivers a response.  Returns the status code the daemon
 * placed in the response's fcol field (see RAIDFRAME_SEND_SPARET).
 * Ownership of *req transfers to the daemon side; the response element
 * dequeued below is a different allocation, freed here.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	/* Post the request on the wait queue and wake the daemon,
	 * which is blocked in the RAIDFRAME_SPARET_WAIT ioctl. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2() drops the mutex while sleeping and retakes it
	 * before returning (the historical mpsleep did the same), so the
	 * queue must be re-checked each time around the loop. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Dequeue the daemon's response under the mutex. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The daemon passes its status back in fcol. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1950 #endif
1951
1952 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1953 * bp & passes it down.
1954 * any calls originating in the kernel must use non-blocking I/O
1955 * do some extra sanity checking to return "appropriate" error values for
1956 * certain conditions (to make some standard utilities work)
1957 *
1958 * Formerly known as: rf_DoAccessKernel
1959 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	int unit;
	struct raid_softc *rs;
	int do_async;
	struct buf *bp;
	int rc;

	unit = raidPtr->raidid;
	rs = &raid_softc[unit];

	/* quick check to see if anything has died recently */
	RF_LOCK_MUTEX(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Component labels must be updated with the mutex
		 * released, so drop it around the call. */
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Invariant: the mutex is held when the loop condition is
	 * evaluated, and released immediately inside the body. */
	while (raidPtr->openings > 0) {
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb is 1 if the request isn't a whole number of sectors. */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* NOTE(review): the "1 ||" forces this debug branch on
		 * unconditionally -- presumably leftover from debugging;
		 * confirm before removing. */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject requests that run past the end of the array; the
		 * "sum < ..." comparisons also catch arithmetic wraparound. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* Retake the mutex to restore the loop invariant. */
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Requests must be a whole number of sectors. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* Consume one opening for this I/O; it is given back when
		 * the access completes (not in this function). */
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->openings--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* NOTE(review): the opening consumed above is not
			 * restored here, and disk_busy() is not paired with
			 * disk_unbusy() on this path -- confirm the
			 * rf_DoAccess() failure path accounts for both. */
			/* continue loop */
		}

		RF_LOCK_MUTEX(raidPtr->mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->mutex);
}
2080
2081
2082
2083
2084 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2085
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	/* Remember which queue this request belongs to; the completion
	 * handler (KernelWakeupFunc) recovers it via req->queue. */
	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		/* NOTE(review): the extra parentheses suggest this may once
		 * have been a db1_printf() -- confirm before changing. */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Fake an immediate completion so the normal completion
		 * path runs without touching the disk. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up bp to describe this request to the component
		 * device; KernelWakeupFunc runs at biodone time. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			    (int) req->sectorOffset, (int) req->numSector,
			    (int) (req->numSector <<
				   queue->raidPtr->logBytesPerSector),
			    (int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	/* Always succeeds; errors are reported asynchronously via
	 * KernelWakeupFunc / req->error. */
	return (0);
}
/* This is the callback function associated with an I/O invoked from
   kernel code.
 */
/*
 * Biodone callback for component I/O dispatched by rf_DispatchKernelIO().
 * Recovers the RF_DiskQueueData_t request stashed in bp->b_private,
 * records any I/O error (possibly failing the component), and hands the
 * finished request to the raidio thread via the per-array iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The request was stored in b_private when the buf was set up. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/*
	 * All bookkeeping below happens under the per-array iodone_lock
	 * so the raidio thread sees a consistent failure count and queue.
	 */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		/* Charge the elapsed disk-wait time to the trace record. */
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once (only a live component is
		   transitioned to rf_ds_failed)... */
		/* and only if it wouldn't leave this RAID set
		   completely broken (i.e. the failure is still within
		   what the parity layout can tolerate) */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error. Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2227
2228
2229 /*
2230 * initialize a buf structure for doing an I/O in the kernel.
2231 */
2232 static void
2233 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2234 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2235 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2236 struct proc *b_proc)
2237 {
2238 /* bp->b_flags = B_PHYS | rw_flag; */
2239 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2240 bp->b_oflags = 0;
2241 bp->b_cflags = 0;
2242 bp->b_bcount = numSect << logBytesPerSector;
2243 bp->b_bufsize = bp->b_bcount;
2244 bp->b_error = 0;
2245 bp->b_dev = dev;
2246 bp->b_data = bf;
2247 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2248 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2249 if (bp->b_bcount == 0) {
2250 panic("bp->b_bcount is zero in InitBP!!");
2251 }
2252 bp->b_proc = b_proc;
2253 bp->b_iodone = cbFunc;
2254 bp->b_private = cbArg;
2255 }
2256
2257 static void
2258 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2259 struct disklabel *lp)
2260 {
2261 memset(lp, 0, sizeof(*lp));
2262
2263 /* fabricate a label... */
2264 lp->d_secperunit = raidPtr->totalSectors;
2265 lp->d_secsize = raidPtr->bytesPerSector;
2266 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2267 lp->d_ntracks = 4 * raidPtr->numCol;
2268 lp->d_ncylinders = raidPtr->totalSectors /
2269 (lp->d_nsectors * lp->d_ntracks);
2270 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2271
2272 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2273 lp->d_type = DTYPE_RAID;
2274 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2275 lp->d_rpm = 3600;
2276 lp->d_interleave = 1;
2277 lp->d_flags = 0;
2278
2279 lp->d_partitions[RAW_PART].p_offset = 0;
2280 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2281 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2282 lp->d_npartitions = RAW_PART + 1;
2283
2284 lp->d_magic = DISKMAGIC;
2285 lp->d_magic2 = DISKMAGIC;
2286 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2287
2288 }
2289 /*
2290 * Read the disklabel from the raid device. If one is not present, fake one
2291 * up.
2292 */
2293 static void
2294 raidgetdisklabel(dev_t dev)
2295 {
2296 int unit = raidunit(dev);
2297 struct raid_softc *rs = &raid_softc[unit];
2298 const char *errstring;
2299 struct disklabel *lp = rs->sc_dkdev.dk_label;
2300 struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
2301 RF_Raid_t *raidPtr;
2302
2303 db1_printf(("Getting the disklabel...\n"));
2304
2305 memset(clp, 0, sizeof(*clp));
2306
2307 raidPtr = raidPtrs[unit];
2308
2309 raidgetdefaultlabel(raidPtr, rs, lp);
2310
2311 /*
2312 * Call the generic disklabel extraction routine.
2313 */
2314 errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
2315 rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
2316 if (errstring)
2317 raidmakedisklabel(rs);
2318 else {
2319 int i;
2320 struct partition *pp;
2321
2322 /*
2323 * Sanity check whether the found disklabel is valid.
2324 *
2325 * This is necessary since total size of the raid device
2326 * may vary when an interleave is changed even though exactly
2327 * same components are used, and old disklabel may used
2328 * if that is found.
2329 */
2330 if (lp->d_secperunit != rs->sc_size)
2331 printf("raid%d: WARNING: %s: "
2332 "total sector size in disklabel (%" PRIu32 ") != "
2333 "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
2334 lp->d_secperunit, rs->sc_size);
2335 for (i = 0; i < lp->d_npartitions; i++) {
2336 pp = &lp->d_partitions[i];
2337 if (pp->p_offset + pp->p_size > rs->sc_size)
2338 printf("raid%d: WARNING: %s: end of partition `%c' "
2339 "exceeds the size of raid (%" PRIu64 ")\n",
2340 unit, rs->sc_xname, 'a' + i, rs->sc_size);
2341 }
2342 }
2343
2344 }
2345 /*
2346 * Take care of things one might want to take care of in the event
2347 * that a disklabel isn't present.
2348 */
2349 static void
2350 raidmakedisklabel(struct raid_softc *rs)
2351 {
2352 struct disklabel *lp = rs->sc_dkdev.dk_label;
2353 db1_printf(("Making a label..\n"));
2354
2355 /*
2356 * For historical reasons, if there's no disklabel present
2357 * the raw partition must be marked FS_BSDFFS.
2358 */
2359
2360 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2361
2362 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2363
2364 lp->d_checksum = dkcksum(lp);
2365 }
2366 /*
2367 * Wait interruptibly for an exclusive lock.
2368 *
2369 * XXX
2370 * Several drivers do this; it should be abstracted and made MP-safe.
2371 * (Hmm... where have we seen this warning before :-> GO )
2372 */
2373 static int
2374 raidlock(struct raid_softc *rs)
2375 {
2376 int error;
2377
2378 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2379 rs->sc_flags |= RAIDF_WANTED;
2380 if ((error =
2381 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2382 return (error);
2383 }
2384 rs->sc_flags |= RAIDF_LOCKED;
2385 return (0);
2386 }
2387 /*
2388 * Unlock and wake up any waiters.
2389 */
2390 static void
2391 raidunlock(struct raid_softc *rs)
2392 {
2393
2394 rs->sc_flags &= ~RAIDF_LOCKED;
2395 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2396 rs->sc_flags &= ~RAIDF_WANTED;
2397 wakeup(rs);
2398 }
2399 }
2400
2401
2402 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2403 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2404 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2405
/* Byte offset of the component info (label) area on each component. */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
2412
2413 static daddr_t
2414 rf_component_info_size(unsigned secsize)
2415 {
2416 daddr_t info_size;
2417
2418 KASSERT(secsize);
2419 if (secsize > RF_COMPONENT_INFO_SIZE)
2420 info_size = secsize;
2421 else
2422 info_size = RF_COMPONENT_INFO_SIZE;
2423
2424 return info_size;
2425 }
2426
2427 static daddr_t
2428 rf_parity_map_offset(RF_Raid_t *raidPtr)
2429 {
2430 daddr_t map_offset;
2431
2432 KASSERT(raidPtr->bytesPerSector);
2433 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2434 map_offset = raidPtr->bytesPerSector;
2435 else
2436 map_offset = RF_COMPONENT_INFO_SIZE;
2437 map_offset += rf_component_info_offset();
2438
2439 return map_offset;
2440 }
2441
2442 static daddr_t
2443 rf_parity_map_size(RF_Raid_t *raidPtr)
2444 {
2445 daddr_t map_size;
2446
2447 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2448 map_size = raidPtr->bytesPerSector;
2449 else
2450 map_size = RF_PARITY_MAP_SIZE;
2451
2452 return map_size;
2453 }
2454
2455 int
2456 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2457 {
2458 RF_ComponentLabel_t *clabel;
2459
2460 clabel = raidget_component_label(raidPtr, col);
2461 clabel->clean = RF_RAID_CLEAN;
2462 raidflush_component_label(raidPtr, col);
2463 return(0);
2464 }
2465
2466
2467 int
2468 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2469 {
2470 RF_ComponentLabel_t *clabel;
2471
2472 clabel = raidget_component_label(raidPtr, col);
2473 clabel->clean = RF_RAID_DIRTY;
2474 raidflush_component_label(raidPtr, col);
2475 return(0);
2476 }
2477
2478 int
2479 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2480 {
2481 KASSERT(raidPtr->bytesPerSector);
2482 return raidread_component_label(raidPtr->bytesPerSector,
2483 raidPtr->Disks[col].dev,
2484 raidPtr->raid_cinfo[col].ci_vp,
2485 &raidPtr->raid_cinfo[col].ci_label);
2486 }
2487
/* Return a pointer to the in-core component label for column 'col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2493
2494 int
2495 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2496 {
2497 RF_ComponentLabel_t *label;
2498
2499 label = &raidPtr->raid_cinfo[col].ci_label;
2500 label->mod_counter = raidPtr->mod_counter;
2501 #ifndef RF_NO_PARITY_MAP
2502 label->parity_map_modcount = label->mod_counter;
2503 #endif
2504 return raidwrite_component_label(raidPtr->bytesPerSector,
2505 raidPtr->Disks[col].dev,
2506 raidPtr->raid_cinfo[col].ci_vp, label);
2507 }
2508
2509
/*
 * Read a component label from the component info area of 'dev'/'b_vp'.
 * Thin wrapper around raidread_component_area() with the label's
 * size/offset.  'secsize' is the component's sector size, used to size
 * the info area.
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
2519
2520 /* ARGSUSED */
2521 static int
2522 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2523 size_t msize, daddr_t offset, daddr_t dsize)
2524 {
2525 struct buf *bp;
2526 const struct bdevsw *bdev;
2527 int error;
2528
2529 /* XXX should probably ensure that we don't try to do this if
2530 someone has changed rf_protected_sectors. */
2531
2532 if (b_vp == NULL) {
2533 /* For whatever reason, this component is not valid.
2534 Don't try to read a component label from it. */
2535 return(EINVAL);
2536 }
2537
2538 /* get a block of the appropriate size... */
2539 bp = geteblk((int)dsize);
2540 bp->b_dev = dev;
2541
2542 /* get our ducks in a row for the read */
2543 bp->b_blkno = offset / DEV_BSIZE;
2544 bp->b_bcount = dsize;
2545 bp->b_flags |= B_READ;
2546 bp->b_resid = dsize;
2547
2548 bdev = bdevsw_lookup(bp->b_dev);
2549 if (bdev == NULL)
2550 return (ENXIO);
2551 (*bdev->d_strategy)(bp);
2552
2553 error = biowait(bp);
2554
2555 if (!error) {
2556 memcpy(data, bp->b_data, msize);
2557 }
2558
2559 brelse(bp, 0);
2560 return(error);
2561 }
2562
2563
/*
 * Write a component label to the component info area of 'dev'/'b_vp'.
 * Thin wrapper around raidwrite_component_area() with the label's
 * size/offset; always synchronous (asyncp == 0).
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
2573
2574 /* ARGSUSED */
2575 static int
2576 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2577 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2578 {
2579 struct buf *bp;
2580 const struct bdevsw *bdev;
2581 int error;
2582
2583 /* get a block of the appropriate size... */
2584 bp = geteblk((int)dsize);
2585 bp->b_dev = dev;
2586
2587 /* get our ducks in a row for the write */
2588 bp->b_blkno = offset / DEV_BSIZE;
2589 bp->b_bcount = dsize;
2590 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2591 bp->b_resid = dsize;
2592
2593 memset(bp->b_data, 0, dsize);
2594 memcpy(bp->b_data, data, msize);
2595
2596 bdev = bdevsw_lookup(bp->b_dev);
2597 if (bdev == NULL)
2598 return (ENXIO);
2599 (*bdev->d_strategy)(bp);
2600 if (asyncp)
2601 return 0;
2602 error = biowait(bp);
2603 brelse(bp, 0);
2604 if (error) {
2605 #if 1
2606 printf("Failed to write RAID component info!\n");
2607 #endif
2608 }
2609
2610 return(error);
2611 }
2612
/*
 * Write the in-core parity map 'map' to the parity map area of every
 * live component.  Dead components are skipped.
 */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
2630
2631 void
2632 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2633 {
2634 struct rf_paritymap_ondisk tmp;
2635 int c,first;
2636
2637 first=1;
2638 for (c = 0; c < raidPtr->numCol; c++) {
2639 /* Skip dead disks. */
2640 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2641 continue;
2642 raidread_component_area(raidPtr->Disks[c].dev,
2643 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2644 RF_PARITYMAP_NBYTE,
2645 rf_parity_map_offset(raidPtr),
2646 rf_parity_map_size(raidPtr));
2647 if (first) {
2648 memcpy(map, &tmp, sizeof(*map));
2649 first = 0;
2650 } else {
2651 rf_paritymap_merge(map, &tmp);
2652 }
2653 }
2654 }
2655
/*
 * Bump the array's modification counter and mark the component label of
 * every live (non-failed, non-spared) component dirty, flushing each
 * label to disk.  Used spares get a freshly initialized label noting
 * which column they stand in for, and are marked dirty as well.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare replaced.
			   NOTE(review): if no column references this
			   spare, scol stays -1 and is written to
			   clabel->column below -- confirm that cannot
			   happen for an rf_ds_used_spare disk. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2715
2716
/*
 * Refresh the component labels of all optimal components and used
 * spares: bump the modification counter, note status and configured
 * unit, and flush each label.  When 'final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, the labels are
 * additionally marked clean (a second flush via raidmarkclean()).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare replaced. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2791
2792 void
2793 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2794 {
2795
2796 if (vp != NULL) {
2797 if (auto_configured == 1) {
2798 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2799 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2800 vput(vp);
2801
2802 } else {
2803 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2804 }
2805 }
2806 }
2807
2808
2809 void
2810 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2811 {
2812 int r,c;
2813 struct vnode *vp;
2814 int acd;
2815
2816
2817 /* We take this opportunity to close the vnodes like we should.. */
2818
2819 for (c = 0; c < raidPtr->numCol; c++) {
2820 vp = raidPtr->raid_cinfo[c].ci_vp;
2821 acd = raidPtr->Disks[c].auto_configured;
2822 rf_close_component(raidPtr, vp, acd);
2823 raidPtr->raid_cinfo[c].ci_vp = NULL;
2824 raidPtr->Disks[c].auto_configured = 0;
2825 }
2826
2827 for (r = 0; r < raidPtr->numSpare; r++) {
2828 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2829 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2830 rf_close_component(raidPtr, vp, acd);
2831 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2832 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2833 }
2834 }
2835
2836
/*
 * Kernel-thread body: fail the component named in 'req' (optionally
 * starting reconstruction to a spare, per RF_FDFLAGS_RECON), then free
 * the request and exit.  recon_in_progress brackets the work so other
 * code can tell a reconstruction is running.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* This thread owns the request; release it when done. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2858
/*
 * Kernel-thread body: rewrite all parity on the array.  On success the
 * in-core parity_good flag is set to RF_RAID_CLEAN (component labels
 * get the clean bit at shutdown).  Wakes anyone waiting on
 * parity_rewrite_in_progress if a shutdown is pending, then exits.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2889
2890
2891 void
2892 rf_CopybackThread(RF_Raid_t *raidPtr)
2893 {
2894 int s;
2895
2896 raidPtr->copyback_in_progress = 1;
2897 s = splbio();
2898 rf_CopybackReconstructedData(raidPtr);
2899 splx(s);
2900 raidPtr->copyback_in_progress = 0;
2901
2902 /* That's all... */
2903 kthread_exit(0); /* does not return */
2904 }
2905
2906
/*
 * Kernel-thread body: reconstruct the data of column req->col in place
 * (onto the same component), bracketed by recon_in_progress.  Frees the
 * request and exits.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* This thread owns the request; release it when done. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2924
/*
 * Try to read a component label from dev/vp.  If a reasonable label is
 * found, prepend a new RF_AutoConfig_t (taking ownership of 'vp' and
 * the label) to 'ac_list' and return the new list head.  Otherwise the
 * label is freed, the vnode is closed and released, and the unchanged
 * list is returned.  On out-of-memory the entire list is torn down and
 * NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* Out of memory: free every entry (and label) gathered
		   so far, since the caller only sees our return value. */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL;		/* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			       cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: this component is not joining the list, so
		   release the label and close/release the vnode here. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2982
/*
 * Walk every disk-class device in the system looking for RAIDframe
 * components: wedges whose partition type is DKW_PTYPE_RAIDFRAME, and
 * disklabel partitions of type FS_RAID.  Each candidate is handed to
 * rf_get_component(), which validates its component label and collects
 * it.  Returns the (possibly empty) list of autoconfig candidates.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		/* get a vnode for the raw partition of this disk */

		/* Wedges ("dk") have no partitions; everything else is
		   addressed through its raw partition. */
		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* Wedge: check its partition type directly;
			   rf_get_component() takes ownership of vp on
			   this path. */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				/* NOTE(review): opened with FREAD only but
				   closed FREAD|FWRITE -- confirm intended. */
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		/* Scan each FS_RAID partition of this disk. */
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			/* rf_get_component() takes ownership of vp. */
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
3133
3134
3135 static int
3136 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3137 {
3138
3139 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3140 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3141 ((clabel->clean == RF_RAID_CLEAN) ||
3142 (clabel->clean == RF_RAID_DIRTY)) &&
3143 clabel->row >=0 &&
3144 clabel->column >= 0 &&
3145 clabel->num_rows > 0 &&
3146 clabel->num_columns > 0 &&
3147 clabel->row < clabel->num_rows &&
3148 clabel->column < clabel->num_columns &&
3149 clabel->blockSize > 0 &&
3150 /*
3151 * numBlocksHi may contain garbage, but it is ok since
3152 * the type is unsigned. If it is really garbage,
3153 * rf_fix_old_label_size() will fix it.
3154 */
3155 rf_component_label_numblocks(clabel) > 0) {
3156 /*
3157 * label looks reasonable enough...
3158 * let's make sure it has no old garbage.
3159 */
3160 rf_fix_old_label_size(clabel, numsecs);
3161 return(1);
3162 }
3163 return(0);
3164 }
3165
3166
3167 /*
3168 * For reasons yet unknown, some old component labels have garbage in
3169 * the newer numBlocksHi region, and this causes lossage. Since those
3170 * disks will also have numsecs set to less than 32 bits of sectors,
3171 * we can determine when this corruption has occured, and fix it.
3172 *
3173 * The exact same problem, with the same unknown reason, happens to
3174 * the partitionSizeHi member as well.
3175 */
3176 static void
3177 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3178 {
3179
3180 if (numsecs < ((uint64_t)1 << 32)) {
3181 if (clabel->numBlocksHi) {
3182 printf("WARNING: total sectors < 32 bits, yet "
3183 "numBlocksHi set\n"
3184 "WARNING: resetting numBlocksHi to zero.\n");
3185 clabel->numBlocksHi = 0;
3186 }
3187
3188 if (clabel->partitionSizeHi) {
3189 printf("WARNING: total sectors < 32 bits, yet "
3190 "partitionSizeHi set\n"
3191 "WARNING: resetting partitionSizeHi to zero.\n");
3192 clabel->partitionSizeHi = 0;
3193 }
3194 }
3195 }
3196
3197
3198 #ifdef DEBUG
/* Dump the interesting fields of a component label (DEBUG only). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;

	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
3227 #endif
3228
3229 RF_ConfigSet_t *
3230 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3231 {
3232 RF_AutoConfig_t *ac;
3233 RF_ConfigSet_t *config_sets;
3234 RF_ConfigSet_t *cset;
3235 RF_AutoConfig_t *ac_next;
3236
3237
3238 config_sets = NULL;
3239
3240 /* Go through the AutoConfig list, and figure out which components
3241 belong to what sets. */
3242 ac = ac_list;
3243 while(ac!=NULL) {
3244 /* we're going to putz with ac->next, so save it here
3245 for use at the end of the loop */
3246 ac_next = ac->next;
3247
3248 if (config_sets == NULL) {
3249 /* will need at least this one... */
3250 config_sets = (RF_ConfigSet_t *)
3251 malloc(sizeof(RF_ConfigSet_t),
3252 M_RAIDFRAME, M_NOWAIT);
3253 if (config_sets == NULL) {
3254 panic("rf_create_auto_sets: No memory!");
3255 }
3256 /* this one is easy :) */
3257 config_sets->ac = ac;
3258 config_sets->next = NULL;
3259 config_sets->rootable = 0;
3260 ac->next = NULL;
3261 } else {
3262 /* which set does this component fit into? */
3263 cset = config_sets;
3264 while(cset!=NULL) {
3265 if (rf_does_it_fit(cset, ac)) {
3266 /* looks like it matches... */
3267 ac->next = cset->ac;
3268 cset->ac = ac;
3269 break;
3270 }
3271 cset = cset->next;
3272 }
3273 if (cset==NULL) {
3274 /* didn't find a match above... new set..*/
3275 cset = (RF_ConfigSet_t *)
3276 malloc(sizeof(RF_ConfigSet_t),
3277 M_RAIDFRAME, M_NOWAIT);
3278 if (cset == NULL) {
3279 panic("rf_create_auto_sets: No memory!");
3280 }
3281 cset->ac = ac;
3282 ac->next = NULL;
3283 cset->next = config_sets;
3284 cset->rootable = 0;
3285 config_sets = cset;
3286 }
3287 }
3288 ac = ac_next;
3289 }
3290
3291
3292 return(config_sets);
3293 }
3294
3295 static int
3296 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3297 {
3298 RF_ComponentLabel_t *clabel1, *clabel2;
3299
3300 /* If this one matches the *first* one in the set, that's good
3301 enough, since the other members of the set would have been
3302 through here too... */
3303 /* note that we are not checking partitionSize here..
3304
3305 Note that we are also not checking the mod_counters here.
3306 If everything else matches execpt the mod_counter, that's
3307 good enough for this test. We will deal with the mod_counters
3308 a little later in the autoconfiguration process.
3309
3310 (clabel1->mod_counter == clabel2->mod_counter) &&
3311
3312 The reason we don't check for this is that failed disks
3313 will have lower modification counts. If those disks are
3314 not added to the set they used to belong to, then they will
3315 form their own set, which may result in 2 different sets,
3316 for example, competing to be configured at raid0, and
3317 perhaps competing to be the root filesystem set. If the
3318 wrong ones get configured, or both attempt to become /,
3319 weird behaviour and or serious lossage will occur. Thus we
3320 need to bring them into the fold here, and kick them out at
3321 a later point.
3322
3323 */
3324
3325 clabel1 = cset->ac->clabel;
3326 clabel2 = ac->clabel;
3327 if ((clabel1->version == clabel2->version) &&
3328 (clabel1->serial_number == clabel2->serial_number) &&
3329 (clabel1->num_rows == clabel2->num_rows) &&
3330 (clabel1->num_columns == clabel2->num_columns) &&
3331 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3332 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3333 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3334 (clabel1->parityConfig == clabel2->parityConfig) &&
3335 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3336 (clabel1->blockSize == clabel2->blockSize) &&
3337 rf_component_label_numblocks(clabel1) ==
3338 rf_component_label_numblocks(clabel2) &&
3339 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3340 (clabel1->root_partition == clabel2->root_partition) &&
3341 (clabel1->last_unit == clabel2->last_unit) &&
3342 (clabel1->config_order == clabel2->config_order)) {
3343 /* if it get's here, it almost *has* to be a match */
3344 } else {
3345 /* it's not consistent with somebody in the set..
3346 punt */
3347 return(0);
3348 }
3349 /* all was fine.. it must fit... */
3350 return(1);
3351 }
3352
/*
 * Check whether config set 'cset' has enough live components to be
 * configured.  Returns 1 if the set is viable, 0 if too many
 * components are missing for the set's parity type.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;		/* columns expected, from the first label */
	int num_missing;	/* columns with no current component */
	int mod_counter;	/* highest mod_counter seen in the set */
	int mod_counter_found;
	int even_pair_failed;	/* RAID 1: even half of current pair missing */
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set. If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set.
	   It is the highest mod_counter among the members; components
	   with a lower count are stale (e.g. failed before the last
	   config change) and are not counted as present below. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a component claiming column c with an
		   up-to-date mod_counter */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
			/* Didn't find one here! */
			/* special case for RAID 1, especially
			   where there are more than 2
			   components (where RAIDframe treats
			   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component. If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just finished the odd half of a mirror pair
			   without bailing.. reset the even_pair_failed
			   flag, and go on to the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one.
	   (RAID 1 was fully handled by the pair accounting above.) */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3455
3456 void
3457 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3458 RF_Raid_t *raidPtr)
3459 {
3460 RF_ComponentLabel_t *clabel;
3461 int i;
3462
3463 clabel = ac->clabel;
3464
3465 /* 1. Fill in the common stuff */
3466 config->numRow = clabel->num_rows = 1;
3467 config->numCol = clabel->num_columns;
3468 config->numSpare = 0; /* XXX should this be set here? */
3469 config->sectPerSU = clabel->sectPerSU;
3470 config->SUsPerPU = clabel->SUsPerPU;
3471 config->SUsPerRU = clabel->SUsPerRU;
3472 config->parityConfig = clabel->parityConfig;
3473 /* XXX... */
3474 strcpy(config->diskQueueType,"fifo");
3475 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3476 config->layoutSpecificSize = 0; /* XXX ?? */
3477
3478 while(ac!=NULL) {
3479 /* row/col values will be in range due to the checks
3480 in reasonable_label() */
3481 strcpy(config->devnames[0][ac->clabel->column],
3482 ac->devname);
3483 ac = ac->next;
3484 }
3485
3486 for(i=0;i<RF_MAXDBGV;i++) {
3487 config->debugVars[i][0] = 0;
3488 }
3489 }
3490
3491 int
3492 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3493 {
3494 RF_ComponentLabel_t *clabel;
3495 int column;
3496 int sparecol;
3497
3498 raidPtr->autoconfigure = new_value;
3499
3500 for(column=0; column<raidPtr->numCol; column++) {
3501 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3502 clabel = raidget_component_label(raidPtr, column);
3503 clabel->autoconfigure = new_value;
3504 raidflush_component_label(raidPtr, column);
3505 }
3506 }
3507 for(column = 0; column < raidPtr->numSpare ; column++) {
3508 sparecol = raidPtr->numCol + column;
3509 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3510 clabel = raidget_component_label(raidPtr, sparecol);
3511 clabel->autoconfigure = new_value;
3512 raidflush_component_label(raidPtr, sparecol);
3513 }
3514 }
3515 return(new_value);
3516 }
3517
3518 int
3519 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3520 {
3521 RF_ComponentLabel_t *clabel;
3522 int column;
3523 int sparecol;
3524
3525 raidPtr->root_partition = new_value;
3526 for(column=0; column<raidPtr->numCol; column++) {
3527 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3528 clabel = raidget_component_label(raidPtr, column);
3529 clabel->root_partition = new_value;
3530 raidflush_component_label(raidPtr, column);
3531 }
3532 }
3533 for(column = 0; column < raidPtr->numSpare ; column++) {
3534 sparecol = raidPtr->numCol + column;
3535 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3536 clabel = raidget_component_label(raidPtr, sparecol);
3537 clabel->root_partition = new_value;
3538 raidflush_component_label(raidPtr, sparecol);
3539 }
3540 }
3541 return(new_value);
3542 }
3543
3544 void
3545 rf_release_all_vps(RF_ConfigSet_t *cset)
3546 {
3547 RF_AutoConfig_t *ac;
3548
3549 ac = cset->ac;
3550 while(ac!=NULL) {
3551 /* Close the vp, and give it back */
3552 if (ac->vp) {
3553 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3554 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3555 vput(ac->vp);
3556 ac->vp = NULL;
3557 }
3558 ac = ac->next;
3559 }
3560 }
3561
3562
3563 void
3564 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3565 {
3566 RF_AutoConfig_t *ac;
3567 RF_AutoConfig_t *next_ac;
3568
3569 ac = cset->ac;
3570 while(ac!=NULL) {
3571 next_ac = ac->next;
3572 /* nuke the label */
3573 free(ac->clabel, M_RAIDFRAME);
3574 /* cleanup the config structure */
3575 free(ac, M_RAIDFRAME);
3576 /* "next.." */
3577 ac = next_ac;
3578 }
3579 /* and, finally, nuke the config set */
3580 free(cset, M_RAIDFRAME);
3581 }
3582
3583
/*
 * Fill in component label 'clabel' from the current state of
 * 'raidPtr'.  Per-component fields (column, partitionSize, etc.)
 * are left to the caller.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3616
/*
 * Configure the config set 'cset' as a raid device.  Prefers the unit
 * number recorded in the component labels (last_unit); falls back to
 * the highest-numbered free unit.  On return *unit holds the unit that
 * was used (or -1 if none was chosen before failure).  Returns 0 on
 * success, non-zero on failure.
 */
int
rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	int retcode;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	retcode = 0;
	*unit = -1;

	/* 1. Create a config structure */

	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
				       M_RAIDFRAME,
				       M_NOWAIT);
	if (config==NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return(1);
	}

	memset(config, 0, sizeof(RF_Config_t));

	/*
	  2. Figure out what RAID ID this one is supposed to live at
	  See if we can get the same RAID dev that it was configured
	  on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	if ((raidID < 0) || (raidID >= numraid)) {
		/* let's not wander off into lala land. */
		raidID = numraid - 1;
	}
	if (raidPtrs[raidID]->valid != 0) {

		/*
		  Nope... Go looking for an alternative...
		  Start high so we don't immediately use raid0 if that's
		  not taken.
		*/

		for(raidID = numraid - 1; raidID >= 0; raidID--) {
			if (raidPtrs[raidID]->valid == 0) {
				/* can use this one! */
				break;
			}
		}
	}

	/* only reachable when the search above found no free unit */
	if (raidID < 0) {
		/* punt... */
		printf("Unable to auto configure this set!\n");
		printf("(Out of RAID devs!)\n");
		free(config, M_RAIDFRAME);
		return(1);
	}

#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = raidPtrs[raidID];

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	retcode = rf_Configure(raidPtr, config, cset->ac);

	if (retcode == 0) {

		raidinit(raidPtrs[raidID]);

		rf_markalldirty(raidPtrs[raidID]);
		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
		if (cset->ac->clabel->root_partition==1) {
			/* everything configured just fine.  Make a note
			   that this set is eligible to be root. */
			cset->rootable = 1;
			/* XXX do this here? */
			raidPtrs[raidID]->root_partition = 1;
		}
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);

	/* note: *unit is set even when rf_Configure failed */
	*unit = raidID;
	return(retcode);
}
3717
3718 void
3719 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3720 {
3721 struct buf *bp;
3722
3723 bp = (struct buf *)desc->bp;
3724 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3725 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3726 }
3727
/*
 * Initialize pool 'p' for objects of 'size' bytes with wait channel
 * name 'w_chan', pre-allocate 'xmin' items, and set the low/high
 * watermarks to xmin/xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3737
3738 /*
3739 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3740 * if there is IO pending and if that IO could possibly be done for a
3741 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3742 * otherwise.
3743 *
3744 */
3745
3746 int
3747 rf_buf_queue_check(int raidid)
3748 {
3749 if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) &&
3750 raidPtrs[raidid]->openings > 0) {
3751 /* there is work to do */
3752 return 0;
3753 }
3754 /* default is nothing to do */
3755 return 1;
3756 }
3757
3758 int
3759 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
3760 {
3761 uint64_t numsecs;
3762 unsigned secsize;
3763 int error;
3764
3765 error = getdisksize(vp, &numsecs, &secsize);
3766 if (error == 0) {
3767 diskPtr->blockSize = secsize;
3768 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3769 diskPtr->partitionSize = numsecs;
3770 return 0;
3771 }
3772 return error;
3773 }
3774
/* autoconf match function: raid pseudo-devices always match */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3780
/* autoconf attach function: nothing to do at attach time */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3786
3787
3788 static int
3789 raid_detach(device_t self, int flags)
3790 {
3791 int error;
3792 struct raid_softc *rs = &raid_softc[device_unit(self)];
3793
3794 if ((error = raidlock(rs)) != 0)
3795 return (error);
3796
3797 error = raid_detach_unlocked(rs);
3798
3799 raidunlock(rs);
3800
3801 return error;
3802 }
3803
/*
 * Publish the raid device's disk geometry as a proplib "disk-info"
 * dictionary on the device and on the disk structure.
 */
static void
rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	prop_dictionary_t disk_info, odisk_info, geom;
	disk_info = prop_dictionary_create();
	geom = prop_dictionary_create();
	prop_dictionary_set_uint64(geom, "sectors-per-unit",
				   raidPtr->totalSectors);
	prop_dictionary_set_uint32(geom, "sector-size",
				   raidPtr->bytesPerSector);

	/* synthetic CHS values: one stripe per "track",
	   4*numCol "tracks" per "cylinder" */
	prop_dictionary_set_uint16(geom, "sectors-per-track",
				   raidPtr->Layout.dataSectorsPerStripe);
	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
				   4 * raidPtr->numCol);

	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
	   raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
	   (4 * raidPtr->numCol)));

	/* the dictionaries hold their own references after _set */
	prop_dictionary_set(disk_info, "geometry", geom);
	prop_object_release(geom);
	prop_dictionary_set(device_properties(rs->sc_dev),
			    "disk-info", disk_info);
	/* swap in the new dictionary, dropping the old one (if any) */
	odisk_info = rs->sc_dkdev.dk_info;
	rs->sc_dkdev.dk_info = disk_info;
	if (odisk_info)
		prop_object_release(odisk_info);
}
3833
3834 /*
3835 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3836 * We end up returning whatever error was returned by the first cache flush
3837 * that fails.
3838 */
3839
3840 int
3841 rf_sync_component_caches(RF_Raid_t *raidPtr)
3842 {
3843 int c, sparecol;
3844 int e,error;
3845 int force = 1;
3846
3847 error = 0;
3848 for (c = 0; c < raidPtr->numCol; c++) {
3849 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3850 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3851 &force, FWRITE, NOCRED);
3852 if (e) {
3853 if (e != ENODEV)
3854 printf("raid%d: cache flush to component %s failed.\n",
3855 raidPtr->raidid, raidPtr->Disks[c].devname);
3856 if (error == 0) {
3857 error = e;
3858 }
3859 }
3860 }
3861 }
3862
3863 for( c = 0; c < raidPtr->numSpare ; c++) {
3864 sparecol = raidPtr->numCol + c;
3865 /* Need to ensure that the reconstruct actually completed! */
3866 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3867 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3868 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3869 if (e) {
3870 if (e != ENODEV)
3871 printf("raid%d: cache flush to component %s failed.\n",
3872 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3873 if (error == 0) {
3874 error = e;
3875 }
3876 }
3877 }
3878 }
3879 return error;
3880 }
3881