rf_netbsdkintf.c revision 1.294.2.1 1 /* $NetBSD: rf_netbsdkintf.c,v 1.294.2.1 2011/11/10 14:31:47 yamt Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.294.2.1 2011/11/10 14:31:47 yamt Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #include "raid.h"
110 #endif
111
112 #include <sys/param.h>
113 #include <sys/errno.h>
114 #include <sys/pool.h>
115 #include <sys/proc.h>
116 #include <sys/queue.h>
117 #include <sys/disk.h>
118 #include <sys/device.h>
119 #include <sys/stat.h>
120 #include <sys/ioctl.h>
121 #include <sys/fcntl.h>
122 #include <sys/systm.h>
123 #include <sys/vnode.h>
124 #include <sys/disklabel.h>
125 #include <sys/conf.h>
126 #include <sys/buf.h>
127 #include <sys/bufq.h>
128 #include <sys/reboot.h>
129 #include <sys/kauth.h>
130
131 #include <prop/proplib.h>
132
133 #include <dev/raidframe/raidframevar.h>
134 #include <dev/raidframe/raidframeio.h>
135 #include <dev/raidframe/rf_paritymap.h>
136
137 #include "rf_raid.h"
138 #include "rf_copyback.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150
151 #ifdef COMPAT_50
152 #include "rf_compat50.h"
153 #endif
154
155 #ifdef DEBUG
156 int rf_kdebug_level = 0;
157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
158 #else /* DEBUG */
159 #define db1_printf(a) { }
160 #endif /* DEBUG */
161
162 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
163
164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
165 static rf_declare_mutex2(rf_sparet_wait_mutex);
166 static rf_declare_cond2(rf_sparet_wait_cv);
167 static rf_declare_cond2(rf_sparet_resp_cv);
168
169 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
170 * spare table */
171 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
172 * installation process */
173 #endif
174
175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
176
177 /* prototypes */
178 static void KernelWakeupFunc(struct buf *);
179 static void InitBP(struct buf *, struct vnode *, unsigned,
180 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
181 void *, int, struct proc *);
182 static void raidinit(RF_Raid_t *);
183
184 void raidattach(int);
185 static int raid_match(device_t, cfdata_t, void *);
186 static void raid_attach(device_t, device_t, void *);
187 static int raid_detach(device_t, int);
188
189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
190 daddr_t, daddr_t);
191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
192 daddr_t, daddr_t, int);
193
194 static int raidwrite_component_label(unsigned,
195 dev_t, struct vnode *, RF_ComponentLabel_t *);
196 static int raidread_component_label(unsigned,
197 dev_t, struct vnode *, RF_ComponentLabel_t *);
198
199
200 dev_type_open(raidopen);
201 dev_type_close(raidclose);
202 dev_type_read(raidread);
203 dev_type_write(raidwrite);
204 dev_type_ioctl(raidioctl);
205 dev_type_strategy(raidstrategy);
206 dev_type_dump(raiddump);
207 dev_type_size(raidsize);
208
209 const struct bdevsw raid_bdevsw = {
210 raidopen, raidclose, raidstrategy, raidioctl,
211 raiddump, raidsize, D_DISK
212 };
213
214 const struct cdevsw raid_cdevsw = {
215 raidopen, raidclose, raidread, raidwrite, raidioctl,
216 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
217 };
218
219 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
220
221 /* XXX Not sure if the following should be replacing the raidPtrs above,
222 or if it should be used in conjunction with that...
223 */
224
225 struct raid_softc {
226 device_t sc_dev;
227 int sc_flags; /* flags */
228 int sc_cflags; /* configuration flags */
229 uint64_t sc_size; /* size of the raid device */
230 char sc_xname[20]; /* XXX external name */
231 struct disk sc_dkdev; /* generic disk device info */
232 struct bufq_state *buf_queue; /* used for the device queue */
233 };
234 /* sc_flags */
235 #define RAIDF_INITED 0x01 /* unit has been initialized */
236 #define RAIDF_WLABEL 0x02 /* label area is writable */
237 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
238 #define RAIDF_SHUTDOWN 0x08 /* unit is being shutdown */
239 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
240 #define RAIDF_LOCKED 0x80 /* unit is locked */
241
242 #define raidunit(x) DISKUNIT(x)
243 int numraid = 0;
244
245 extern struct cfdriver raid_cd;
246 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
247 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
248 DVF_DETACH_SHUTDOWN);
249
250 /*
251 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
252 * Be aware that large numbers can allow the driver to consume a lot of
253 * kernel memory, especially on writes, and in degraded mode reads.
254 *
255 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
256 * a single 64K write will typically require 64K for the old data,
257 * 64K for the old parity, and 64K for the new parity, for a total
258 * of 192K (if the parity buffer is not re-used immediately).
259 * Even if it is used immediately, that's still 128K, which when multiplied
260 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
261 *
262 * Now in degraded mode, for example, a 64K read on the above setup may
263 * require data reconstruction, which will require *all* of the 4 remaining
264 * disks to participate -- 4 * 32K/disk == 128K again.
265 */
266
267 #ifndef RAIDOUTSTANDING
268 #define RAIDOUTSTANDING 6
269 #endif
270
271 #define RAIDLABELDEV(dev) \
272 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
273
274 /* declared here, and made public, for the benefit of KVM stuff.. */
275 struct raid_softc *raid_softc;
276
277 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
278 struct disklabel *);
279 static void raidgetdisklabel(dev_t);
280 static void raidmakedisklabel(struct raid_softc *);
281
282 static int raidlock(struct raid_softc *);
283 static void raidunlock(struct raid_softc *);
284
285 static int raid_detach_unlocked(struct raid_softc *);
286
287 static void rf_markalldirty(RF_Raid_t *);
288 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
289
290 void rf_ReconThread(struct rf_recon_req *);
291 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
292 void rf_CopybackThread(RF_Raid_t *raidPtr);
293 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
294 int rf_autoconfig(device_t);
295 void rf_buildroothack(RF_ConfigSet_t *);
296
297 RF_AutoConfig_t *rf_find_raid_components(void);
298 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
299 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
300 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
301 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
302 int rf_set_autoconfig(RF_Raid_t *, int);
303 int rf_set_rootpartition(RF_Raid_t *, int);
304 void rf_release_all_vps(RF_ConfigSet_t *);
305 void rf_cleanup_config_set(RF_ConfigSet_t *);
306 int rf_have_enough_components(RF_ConfigSet_t *);
307 int rf_auto_config_set(RF_ConfigSet_t *, int *);
308 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
309
310 /*
311 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
312 * Note that this is overridden by having RAID_AUTOCONFIG as an option
313 * in the kernel config file.
314 */
315 #ifdef RAID_AUTOCONFIG
316 int raidautoconfig = 1;
317 #else
318 int raidautoconfig = 0;
319 #endif
320 static bool raidautoconfigdone = false;
321
322 struct RF_Pools_s rf_pools;
323
/*
 * raidattach -- pseudo-device attach routine, called once at boot with
 * the number of RAID units to support ("pseudo-device raid N" in the
 * kernel config).  Allocates the global raidPtrs[] and raid_softc[]
 * arrays, boots the RAIDframe core, hooks the cfattach into autoconf,
 * and registers a finalizer so that auto-configurable RAID sets are
 * assembled after all real hardware has been found.
 *
 * NOTE(review): on the partial-failure paths below (raid_softc or a
 * raidPtrs[i] allocation failing) earlier allocations are not freed --
 * presumably acceptable since this runs once at boot and the driver is
 * unusable anyway; confirm.
 */
void
raidattach(int num)
{
	int raidID;
	int i, rc;

	aprint_debug("raidattach: Asked for %d units\n", num);

	/* A non-positive unit count is a config error; panic only under
	 * DIAGNOSTIC, otherwise silently decline to attach. */
	if (num <= 0) {
#ifdef DIAGNOSTIC
		panic("raidattach: count <= 0");
#endif
		return;
	}
	/* This is where all the initialization stuff gets done. */

	numraid = num;

	/* Make some space for requested number of units... */

	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
	if (raidPtrs == NULL) {
		panic("raidPtrs is NULL!!");
	}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization for the spare-table installation handshake
	 * with the userland helper (parity declustering only). */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	for (i = 0; i < num; i++)
		raidPtrs[i] = NULL;
	rc = rf_BootRaidframe();
	if (rc == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	/* put together some datastructures like the CCD device does.. This
	 * lets us lock the device and what-not when it gets opened. */

	raid_softc = (struct raid_softc *)
	    malloc(num * sizeof(struct raid_softc),
	    M_RAIDFRAME, M_NOWAIT);
	if (raid_softc == NULL) {
		aprint_error("WARNING: no memory for RAIDframe driver\n");
		return;
	}

	memset(raid_softc, 0, num * sizeof(struct raid_softc));

	for (raidID = 0; raidID < num; raidID++) {
		/* FCFS buffer queue feeds raidstrategy()'s deferred I/O. */
		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);

		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
		    (RF_Raid_t *));
		if (raidPtrs[raidID] == NULL) {
			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
			/* Shrink the advertised unit count to what we
			 * actually managed to allocate. */
			numraid = raidID;
			return;
		}
	}

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
403
404 int
405 rf_autoconfig(device_t self)
406 {
407 RF_AutoConfig_t *ac_list;
408 RF_ConfigSet_t *config_sets;
409
410 if (!raidautoconfig || raidautoconfigdone == true)
411 return (0);
412
413 /* XXX This code can only be run once. */
414 raidautoconfigdone = true;
415
416 /* 1. locate all RAID components on the system */
417 aprint_debug("Searching for RAID components...\n");
418 ac_list = rf_find_raid_components();
419
420 /* 2. Sort them into their respective sets. */
421 config_sets = rf_create_auto_sets(ac_list);
422
423 /*
424 * 3. Evaluate each set andconfigure the valid ones.
425 * This gets done in rf_buildroothack().
426 */
427 rf_buildroothack(config_sets);
428
429 return 1;
430 }
431
/*
 * rf_buildroothack -- walk the list of candidate config sets, configure
 * every set that is complete and marked autoconfigure, and then try to
 * determine whether one of the configured sets should become the root
 * device (booted_device).  Consumes config_sets: every set is released
 * and cleaned up regardless of whether configuration succeeded.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int retcode;
	int raidID;
	int rootID;		/* unit number of the (last) rootable set */
	int col;
	int num_root;		/* how many sets claim to be rootable */
	char *devname;

	rootID = 0;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		/* save the link now; rf_cleanup_config_set frees cset */
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			retcode = rf_auto_config_set(cset, &raidID);
			if (!retcode) {
				aprint_debug("raid%d: configured ok\n", raidID);
				if (cset->rootable) {
					rootID = raidID;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		/* exactly one rootable set: it wins */
		booted_device = raid_softc[rootID].sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help.  If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/*
		 * Disambiguate: re-count, keeping only rootable sets
		 * that contain a component whose /dev name matches the
		 * device we actually booted from.
		 */
		num_root = 0;
		for (raidID = 0; raidID < numraid; raidID++) {
			if (raidPtrs[raidID]->valid == 0)
				continue;

			if (raidPtrs[raidID]->root_partition == 0)
				continue;

			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
				devname = raidPtrs[raidID]->Disks[col].devname;
				/* skip the "/dev/" prefix before comparing
				 * against the autoconf device name */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
				    strlen(device_xname(booted_device))) != 0)
					continue;
				aprint_debug("raid%d includes boot device %s\n",
				    raidID, devname);
				num_root++;
				rootID = raidID;
			}
		}

		if (num_root == 1) {
			booted_device = raid_softc[rootID].sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
526
527
528 int
529 raidsize(dev_t dev)
530 {
531 struct raid_softc *rs;
532 struct disklabel *lp;
533 int part, unit, omask, size;
534
535 unit = raidunit(dev);
536 if (unit >= numraid)
537 return (-1);
538 rs = &raid_softc[unit];
539
540 if ((rs->sc_flags & RAIDF_INITED) == 0)
541 return (-1);
542
543 part = DISKPART(dev);
544 omask = rs->sc_dkdev.dk_openmask & (1 << part);
545 lp = rs->sc_dkdev.dk_label;
546
547 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
548 return (-1);
549
550 if (lp->d_partitions[part].p_fstype != FS_SWAP)
551 size = -1;
552 else
553 size = lp->d_partitions[part].p_size *
554 (lp->d_secsize / DEV_BSIZE);
555
556 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
557 return (-1);
558
559 return (size);
560
561 }
562
/*
 * raiddump -- crash-dump entry point.  Only RAID 1 sets (one data
 * column, one parity column) are supported; the dump is written to a
 * single live component of the mirror, preferring the master over its
 * spare, and the slave over its spare.  Returns 0 on success or an
 * errno on failure.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int part, c, sparecol, j, scol, dumpto;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);

	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* dumps are always whole DEV_BSIZE blocks */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* refuse to write past the end of the RAID device */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	 */

	dumpto = -1;
	/* first pass: any optimal (live) component; column 0 is the
	 * master, so it is naturally preferred by the break. */
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one?  Find which column this
			 * spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	/* NOTE(review): bdevsw_lookup() can return NULL for an invalid
	 * dev_t; the result is used unchecked below -- presumably the
	 * component dev is always valid here, but confirm. */
	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	 */

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
/* ARGSUSED */
/*
 * raidopen -- open entry point for both the block and character
 * devices.  Validates the unit and partition, reads the disklabel on
 * first open, marks components dirty on the first open of a configured
 * set, and records the open in the per-format open masks.  Returns 0
 * or an errno.  Note that the success path falls through the "bad:"
 * label with error == 0, so the unit lock is always dropped.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, pmask;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	/* refuse new opens while the unit is being torn down */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* very first open of a configured set: (re)read the label */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
		    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	/* dk_openmask is tested BEFORE it is recomputed below, so this
	 * block runs only on the transition from fully-closed to open. */
	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(raidPtrs[unit]);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
786 /* ARGSUSED */
787 int
788 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
789 {
790 int unit = raidunit(dev);
791 struct raid_softc *rs;
792 int error = 0;
793 int part;
794
795 if (unit >= numraid)
796 return (ENXIO);
797 rs = &raid_softc[unit];
798
799 if ((error = raidlock(rs)) != 0)
800 return (error);
801
802 part = DISKPART(dev);
803
804 /* ...that much closer to allowing unconfiguration... */
805 switch (fmt) {
806 case S_IFCHR:
807 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
808 break;
809
810 case S_IFBLK:
811 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
812 break;
813 }
814 rs->sc_dkdev.dk_openmask =
815 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
816
817 if ((rs->sc_dkdev.dk_openmask == 0) &&
818 ((rs->sc_flags & RAIDF_INITED) != 0)) {
819 /* Last one... device is not unconfigured yet.
820 Device shutdown has taken care of setting the
821 clean bits if RAIDF_INITED is not set
822 mark things as clean... */
823
824 rf_update_component_labels(raidPtrs[unit],
825 RF_FINAL_COMPONENT_UPDATE);
826
827 /* If the kernel is shutting down, it will detach
828 * this RAID set soon enough.
829 */
830 }
831
832 raidunlock(rs);
833 return (0);
834
835 }
836
837 void
838 raidstrategy(struct buf *bp)
839 {
840 unsigned int raidID = raidunit(bp->b_dev);
841 RF_Raid_t *raidPtr;
842 struct raid_softc *rs = &raid_softc[raidID];
843 int wlabel;
844
845 if ((rs->sc_flags & RAIDF_INITED) ==0) {
846 bp->b_error = ENXIO;
847 goto done;
848 }
849 if (raidID >= numraid || !raidPtrs[raidID]) {
850 bp->b_error = ENODEV;
851 goto done;
852 }
853 raidPtr = raidPtrs[raidID];
854 if (!raidPtr->valid) {
855 bp->b_error = ENODEV;
856 goto done;
857 }
858 if (bp->b_bcount == 0) {
859 db1_printf(("b_bcount is zero..\n"));
860 goto done;
861 }
862
863 /*
864 * Do bounds checking and adjust transfer. If there's an
865 * error, the bounds check will flag that for us.
866 */
867
868 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
869 if (DISKPART(bp->b_dev) == RAW_PART) {
870 uint64_t size; /* device size in DEV_BSIZE unit */
871
872 if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
873 size = raidPtr->totalSectors <<
874 (raidPtr->logBytesPerSector - DEV_BSHIFT);
875 } else {
876 size = raidPtr->totalSectors >>
877 (DEV_BSHIFT - raidPtr->logBytesPerSector);
878 }
879 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
880 goto done;
881 }
882 } else {
883 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
884 db1_printf(("Bounds check failed!!:%d %d\n",
885 (int) bp->b_blkno, (int) wlabel));
886 goto done;
887 }
888 }
889
890 rf_lock_mutex2(raidPtr->iodone_lock);
891
892 bp->b_resid = 0;
893
894 /* stuff it onto our queue */
895 bufq_put(rs->buf_queue, bp);
896
897 /* scheduled the IO to happen at the next convenient time */
898 rf_signal_cond2(raidPtr->iodone_cv);
899 rf_unlock_mutex2(raidPtr->iodone_lock);
900
901 return;
902
903 done:
904 bp->b_resid = bp->b_bcount;
905 biodone(bp);
906 }
907 /* ARGSUSED */
908 int
909 raidread(dev_t dev, struct uio *uio, int flags)
910 {
911 int unit = raidunit(dev);
912 struct raid_softc *rs;
913
914 if (unit >= numraid)
915 return (ENXIO);
916 rs = &raid_softc[unit];
917
918 if ((rs->sc_flags & RAIDF_INITED) == 0)
919 return (ENXIO);
920
921 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
922
923 }
924 /* ARGSUSED */
925 int
926 raidwrite(dev_t dev, struct uio *uio, int flags)
927 {
928 int unit = raidunit(dev);
929 struct raid_softc *rs;
930
931 if (unit >= numraid)
932 return (ENXIO);
933 rs = &raid_softc[unit];
934
935 if ((rs->sc_flags & RAIDF_INITED) == 0)
936 return (ENXIO);
937
938 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
939
940 }
941
942 static int
943 raid_detach_unlocked(struct raid_softc *rs)
944 {
945 int error;
946 RF_Raid_t *raidPtr;
947
948 raidPtr = raidPtrs[device_unit(rs->sc_dev)];
949
950 /*
951 * If somebody has a partition mounted, we shouldn't
952 * shutdown.
953 */
954 if (rs->sc_dkdev.dk_openmask != 0)
955 return EBUSY;
956
957 if ((rs->sc_flags & RAIDF_INITED) == 0)
958 ; /* not initialized: nothing to do */
959 else if ((error = rf_Shutdown(raidPtr)) != 0)
960 return error;
961 else
962 rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
963
964 /* Detach the disk. */
965 dkwedge_delall(&rs->sc_dkdev);
966 disk_detach(&rs->sc_dkdev);
967 disk_destroy(&rs->sc_dkdev);
968
969 aprint_normal_dev(rs->sc_dev, "detached\n");
970
971 return 0;
972 }
973
974 int
975 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
976 {
977 int unit = raidunit(dev);
978 int error = 0;
979 int part, pmask;
980 cfdata_t cf;
981 struct raid_softc *rs;
982 RF_Config_t *k_cfg, *u_cfg;
983 RF_Raid_t *raidPtr;
984 RF_RaidDisk_t *diskPtr;
985 RF_AccTotals_t *totals;
986 RF_DeviceConfig_t *d_cfg, **ucfgp;
987 u_char *specific_buf;
988 int retcode = 0;
989 int column;
990 /* int raidid; */
991 struct rf_recon_req *rrcopy, *rr;
992 RF_ComponentLabel_t *clabel;
993 RF_ComponentLabel_t *ci_label;
994 RF_ComponentLabel_t **clabel_ptr;
995 RF_SingleComponent_t *sparePtr,*componentPtr;
996 RF_SingleComponent_t component;
997 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
998 int i, j, d;
999 #ifdef __HAVE_OLD_DISKLABEL
1000 struct disklabel newlabel;
1001 #endif
1002 struct dkwedge_info *dkw;
1003
1004 if (unit >= numraid)
1005 return (ENXIO);
1006 rs = &raid_softc[unit];
1007 raidPtr = raidPtrs[unit];
1008
1009 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1010 (int) DISKPART(dev), (int) unit, cmd));
1011
1012 /* Must be open for writes for these commands... */
1013 switch (cmd) {
1014 #ifdef DIOCGSECTORSIZE
1015 case DIOCGSECTORSIZE:
1016 *(u_int *)data = raidPtr->bytesPerSector;
1017 return 0;
1018 case DIOCGMEDIASIZE:
1019 *(off_t *)data =
1020 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1021 return 0;
1022 #endif
1023 case DIOCSDINFO:
1024 case DIOCWDINFO:
1025 #ifdef __HAVE_OLD_DISKLABEL
1026 case ODIOCWDINFO:
1027 case ODIOCSDINFO:
1028 #endif
1029 case DIOCWLABEL:
1030 case DIOCAWEDGE:
1031 case DIOCDWEDGE:
1032 if ((flag & FWRITE) == 0)
1033 return (EBADF);
1034 }
1035
1036 /* Must be initialized for these... */
1037 switch (cmd) {
1038 case DIOCGDINFO:
1039 case DIOCSDINFO:
1040 case DIOCWDINFO:
1041 #ifdef __HAVE_OLD_DISKLABEL
1042 case ODIOCGDINFO:
1043 case ODIOCWDINFO:
1044 case ODIOCSDINFO:
1045 case ODIOCGDEFLABEL:
1046 #endif
1047 case DIOCGPART:
1048 case DIOCWLABEL:
1049 case DIOCGDEFLABEL:
1050 case DIOCAWEDGE:
1051 case DIOCDWEDGE:
1052 case DIOCLWEDGES:
1053 case DIOCCACHESYNC:
1054 case RAIDFRAME_SHUTDOWN:
1055 case RAIDFRAME_REWRITEPARITY:
1056 case RAIDFRAME_GET_INFO:
1057 case RAIDFRAME_RESET_ACCTOTALS:
1058 case RAIDFRAME_GET_ACCTOTALS:
1059 case RAIDFRAME_KEEP_ACCTOTALS:
1060 case RAIDFRAME_GET_SIZE:
1061 case RAIDFRAME_FAIL_DISK:
1062 case RAIDFRAME_COPYBACK:
1063 case RAIDFRAME_CHECK_RECON_STATUS:
1064 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1065 case RAIDFRAME_GET_COMPONENT_LABEL:
1066 case RAIDFRAME_SET_COMPONENT_LABEL:
1067 case RAIDFRAME_ADD_HOT_SPARE:
1068 case RAIDFRAME_REMOVE_HOT_SPARE:
1069 case RAIDFRAME_INIT_LABELS:
1070 case RAIDFRAME_REBUILD_IN_PLACE:
1071 case RAIDFRAME_CHECK_PARITY:
1072 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1073 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1074 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1075 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1076 case RAIDFRAME_SET_AUTOCONFIG:
1077 case RAIDFRAME_SET_ROOT:
1078 case RAIDFRAME_DELETE_COMPONENT:
1079 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1080 case RAIDFRAME_PARITYMAP_STATUS:
1081 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1082 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1083 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1084 if ((rs->sc_flags & RAIDF_INITED) == 0)
1085 return (ENXIO);
1086 }
1087
1088 switch (cmd) {
1089 #ifdef COMPAT_50
1090 case RAIDFRAME_GET_INFO50:
1091 return rf_get_info50(raidPtr, data);
1092
1093 case RAIDFRAME_CONFIGURE50:
1094 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1095 return retcode;
1096 goto config;
1097 #endif
1098 /* configure the system */
1099 case RAIDFRAME_CONFIGURE:
1100
1101 if (raidPtr->valid) {
1102 /* There is a valid RAID set running on this unit! */
1103 printf("raid%d: Device already configured!\n",unit);
1104 return(EINVAL);
1105 }
1106
1107 /* copy-in the configuration information */
1108 /* data points to a pointer to the configuration structure */
1109
1110 u_cfg = *((RF_Config_t **) data);
1111 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1112 if (k_cfg == NULL) {
1113 return (ENOMEM);
1114 }
1115 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1116 if (retcode) {
1117 RF_Free(k_cfg, sizeof(RF_Config_t));
1118 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1119 retcode));
1120 return (retcode);
1121 }
1122 goto config;
1123 config:
1124 /* allocate a buffer for the layout-specific data, and copy it
1125 * in */
1126 if (k_cfg->layoutSpecificSize) {
1127 if (k_cfg->layoutSpecificSize > 10000) {
1128 /* sanity check */
1129 RF_Free(k_cfg, sizeof(RF_Config_t));
1130 return (EINVAL);
1131 }
1132 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1133 (u_char *));
1134 if (specific_buf == NULL) {
1135 RF_Free(k_cfg, sizeof(RF_Config_t));
1136 return (ENOMEM);
1137 }
1138 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1139 k_cfg->layoutSpecificSize);
1140 if (retcode) {
1141 RF_Free(k_cfg, sizeof(RF_Config_t));
1142 RF_Free(specific_buf,
1143 k_cfg->layoutSpecificSize);
1144 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1145 retcode));
1146 return (retcode);
1147 }
1148 } else
1149 specific_buf = NULL;
1150 k_cfg->layoutSpecific = specific_buf;
1151
1152 /* should do some kind of sanity check on the configuration.
1153 * Store the sum of all the bytes in the last byte? */
1154
1155 /* configure the system */
1156
1157 /*
1158 * Clear the entire RAID descriptor, just to make sure
1159 * there is no stale data left in the case of a
1160 * reconfiguration
1161 */
1162 memset(raidPtr, 0, sizeof(*raidPtr));
1163 raidPtr->raidid = unit;
1164
1165 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1166
1167 if (retcode == 0) {
1168
1169 /* allow this many simultaneous IO's to
1170 this RAID device */
1171 raidPtr->openings = RAIDOUTSTANDING;
1172
1173 raidinit(raidPtr);
1174 rf_markalldirty(raidPtr);
1175 }
1176 /* free the buffers. No return code here. */
1177 if (k_cfg->layoutSpecificSize) {
1178 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1179 }
1180 RF_Free(k_cfg, sizeof(RF_Config_t));
1181
1182 return (retcode);
1183
1184 /* shutdown the system */
1185 case RAIDFRAME_SHUTDOWN:
1186
1187 part = DISKPART(dev);
1188 pmask = (1 << part);
1189
1190 if ((error = raidlock(rs)) != 0)
1191 return (error);
1192
1193 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1194 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1195 (rs->sc_dkdev.dk_copenmask & pmask)))
1196 retcode = EBUSY;
1197 else {
1198 rs->sc_flags |= RAIDF_SHUTDOWN;
1199 rs->sc_dkdev.dk_copenmask &= ~pmask;
1200 rs->sc_dkdev.dk_bopenmask &= ~pmask;
1201 rs->sc_dkdev.dk_openmask &= ~pmask;
1202 retcode = 0;
1203 }
1204
1205 raidunlock(rs);
1206
1207 if (retcode != 0)
1208 return retcode;
1209
1210 /* free the pseudo device attach bits */
1211
1212 cf = device_cfdata(rs->sc_dev);
1213 if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
1214 free(cf, M_RAIDFRAME);
1215
1216 return (retcode);
1217 case RAIDFRAME_GET_COMPONENT_LABEL:
1218 clabel_ptr = (RF_ComponentLabel_t **) data;
1219 /* need to read the component label for the disk indicated
1220 by row,column in clabel */
1221
1222 /*
1223 * Perhaps there should be an option to skip the in-core
1224 * copy and hit the disk, as with disklabel(8).
1225 */
1226 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1227
1228 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1229
1230 if (retcode) {
1231 RF_Free(clabel, sizeof(*clabel));
1232 return retcode;
1233 }
1234
1235 clabel->row = 0; /* Don't allow looking at anything else.*/
1236
1237 column = clabel->column;
1238
1239 if ((column < 0) || (column >= raidPtr->numCol +
1240 raidPtr->numSpare)) {
1241 RF_Free(clabel, sizeof(*clabel));
1242 return EINVAL;
1243 }
1244
1245 RF_Free(clabel, sizeof(*clabel));
1246
1247 clabel = raidget_component_label(raidPtr, column);
1248
1249 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1250
1251 #if 0
1252 case RAIDFRAME_SET_COMPONENT_LABEL:
1253 clabel = (RF_ComponentLabel_t *) data;
1254
1255 /* XXX check the label for valid stuff... */
1256 /* Note that some things *should not* get modified --
1257 the user should be re-initing the labels instead of
1258 trying to patch things.
1259 */
1260
1261 raidid = raidPtr->raidid;
1262 #ifdef DEBUG
1263 printf("raid%d: Got component label:\n", raidid);
1264 printf("raid%d: Version: %d\n", raidid, clabel->version);
1265 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1266 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1267 printf("raid%d: Column: %d\n", raidid, clabel->column);
1268 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1269 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1270 printf("raid%d: Status: %d\n", raidid, clabel->status);
1271 #endif
1272 clabel->row = 0;
1273 column = clabel->column;
1274
1275 if ((column < 0) || (column >= raidPtr->numCol)) {
1276 return(EINVAL);
1277 }
1278
1279 /* XXX this isn't allowed to do anything for now :-) */
1280
1281 /* XXX and before it is, we need to fill in the rest
1282 of the fields!?!?!?! */
1283 memcpy(raidget_component_label(raidPtr, column),
1284 clabel, sizeof(*clabel));
1285 raidflush_component_label(raidPtr, column);
1286 return (0);
1287 #endif
1288
1289 case RAIDFRAME_INIT_LABELS:
1290 clabel = (RF_ComponentLabel_t *) data;
1291 /*
1292 we only want the serial number from
1293 the above. We get all the rest of the information
1294 from the config that was used to create this RAID
1295 set.
1296 */
1297
1298 raidPtr->serial_number = clabel->serial_number;
1299
1300 for(column=0;column<raidPtr->numCol;column++) {
1301 diskPtr = &raidPtr->Disks[column];
1302 if (!RF_DEAD_DISK(diskPtr->status)) {
1303 ci_label = raidget_component_label(raidPtr,
1304 column);
1305 /* Zeroing this is important. */
1306 memset(ci_label, 0, sizeof(*ci_label));
1307 raid_init_component_label(raidPtr, ci_label);
1308 ci_label->serial_number =
1309 raidPtr->serial_number;
1310 				ci_label->row = 0; /* we don't pretend to support more */
1311 rf_component_label_set_partitionsize(ci_label,
1312 diskPtr->partitionSize);
1313 ci_label->column = column;
1314 raidflush_component_label(raidPtr, column);
1315 }
1316 /* XXXjld what about the spares? */
1317 }
1318
1319 return (retcode);
1320 case RAIDFRAME_SET_AUTOCONFIG:
1321 d = rf_set_autoconfig(raidPtr, *(int *) data);
1322 printf("raid%d: New autoconfig value is: %d\n",
1323 raidPtr->raidid, d);
1324 *(int *) data = d;
1325 return (retcode);
1326
1327 case RAIDFRAME_SET_ROOT:
1328 d = rf_set_rootpartition(raidPtr, *(int *) data);
1329 printf("raid%d: New rootpartition value is: %d\n",
1330 raidPtr->raidid, d);
1331 *(int *) data = d;
1332 return (retcode);
1333
1334 /* initialize all parity */
1335 case RAIDFRAME_REWRITEPARITY:
1336
1337 if (raidPtr->Layout.map->faultsTolerated == 0) {
1338 /* Parity for RAID 0 is trivially correct */
1339 raidPtr->parity_good = RF_RAID_CLEAN;
1340 return(0);
1341 }
1342
1343 if (raidPtr->parity_rewrite_in_progress == 1) {
1344 /* Re-write is already in progress! */
1345 return(EINVAL);
1346 }
1347
1348 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1349 rf_RewriteParityThread,
1350 raidPtr,"raid_parity");
1351 return (retcode);
1352
1353
1354 case RAIDFRAME_ADD_HOT_SPARE:
1355 sparePtr = (RF_SingleComponent_t *) data;
1356 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1357 retcode = rf_add_hot_spare(raidPtr, &component);
1358 return(retcode);
1359
1360 case RAIDFRAME_REMOVE_HOT_SPARE:
1361 return(retcode);
1362
1363 case RAIDFRAME_DELETE_COMPONENT:
1364 componentPtr = (RF_SingleComponent_t *)data;
1365 memcpy( &component, componentPtr,
1366 sizeof(RF_SingleComponent_t));
1367 retcode = rf_delete_component(raidPtr, &component);
1368 return(retcode);
1369
1370 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1371 componentPtr = (RF_SingleComponent_t *)data;
1372 memcpy( &component, componentPtr,
1373 sizeof(RF_SingleComponent_t));
1374 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1375 return(retcode);
1376
1377 case RAIDFRAME_REBUILD_IN_PLACE:
1378
1379 if (raidPtr->Layout.map->faultsTolerated == 0) {
1380 /* Can't do this on a RAID 0!! */
1381 return(EINVAL);
1382 }
1383
1384 if (raidPtr->recon_in_progress == 1) {
1385 /* a reconstruct is already in progress! */
1386 return(EINVAL);
1387 }
1388
1389 componentPtr = (RF_SingleComponent_t *) data;
1390 memcpy( &component, componentPtr,
1391 sizeof(RF_SingleComponent_t));
1392 component.row = 0; /* we don't support any more */
1393 column = component.column;
1394
1395 if ((column < 0) || (column >= raidPtr->numCol)) {
1396 return(EINVAL);
1397 }
1398
1399 rf_lock_mutex2(raidPtr->mutex);
1400 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1401 (raidPtr->numFailures > 0)) {
1402 /* XXX 0 above shouldn't be constant!!! */
1403 /* some component other than this has failed.
1404 Let's not make things worse than they already
1405 are... */
1406 printf("raid%d: Unable to reconstruct to disk at:\n",
1407 raidPtr->raidid);
1408 printf("raid%d: Col: %d Too many failures.\n",
1409 raidPtr->raidid, column);
1410 rf_unlock_mutex2(raidPtr->mutex);
1411 return (EINVAL);
1412 }
1413 if (raidPtr->Disks[column].status ==
1414 rf_ds_reconstructing) {
1415 printf("raid%d: Unable to reconstruct to disk at:\n",
1416 raidPtr->raidid);
1417 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column);
1418
1419 rf_unlock_mutex2(raidPtr->mutex);
1420 return (EINVAL);
1421 }
1422 if (raidPtr->Disks[column].status == rf_ds_spared) {
1423 rf_unlock_mutex2(raidPtr->mutex);
1424 return (EINVAL);
1425 }
1426 rf_unlock_mutex2(raidPtr->mutex);
1427
1428 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1429 if (rrcopy == NULL)
1430 return(ENOMEM);
1431
1432 rrcopy->raidPtr = (void *) raidPtr;
1433 rrcopy->col = column;
1434
1435 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1436 rf_ReconstructInPlaceThread,
1437 rrcopy,"raid_reconip");
1438 return(retcode);
1439
1440 case RAIDFRAME_GET_INFO:
1441 if (!raidPtr->valid)
1442 return (ENODEV);
1443 ucfgp = (RF_DeviceConfig_t **) data;
1444 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1445 (RF_DeviceConfig_t *));
1446 if (d_cfg == NULL)
1447 return (ENOMEM);
1448 d_cfg->rows = 1; /* there is only 1 row now */
1449 d_cfg->cols = raidPtr->numCol;
1450 d_cfg->ndevs = raidPtr->numCol;
1451 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1452 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1453 return (ENOMEM);
1454 }
1455 d_cfg->nspares = raidPtr->numSpare;
1456 if (d_cfg->nspares >= RF_MAX_DISKS) {
1457 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1458 return (ENOMEM);
1459 }
1460 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1461 d = 0;
1462 for (j = 0; j < d_cfg->cols; j++) {
1463 d_cfg->devs[d] = raidPtr->Disks[j];
1464 d++;
1465 }
1466 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1467 d_cfg->spares[i] = raidPtr->Disks[j];
1468 }
1469 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1470 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1471
1472 return (retcode);
1473
1474 case RAIDFRAME_CHECK_PARITY:
1475 *(int *) data = raidPtr->parity_good;
1476 return (0);
1477
1478 case RAIDFRAME_PARITYMAP_STATUS:
1479 if (rf_paritymap_ineligible(raidPtr))
1480 return EINVAL;
1481 rf_paritymap_status(raidPtr->parity_map,
1482 (struct rf_pmstat *)data);
1483 return 0;
1484
1485 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1486 if (rf_paritymap_ineligible(raidPtr))
1487 return EINVAL;
1488 if (raidPtr->parity_map == NULL)
1489 return ENOENT; /* ??? */
1490 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1491 (struct rf_pmparams *)data, 1))
1492 return EINVAL;
1493 return 0;
1494
1495 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1496 if (rf_paritymap_ineligible(raidPtr))
1497 return EINVAL;
1498 *(int *) data = rf_paritymap_get_disable(raidPtr);
1499 return 0;
1500
1501 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1502 if (rf_paritymap_ineligible(raidPtr))
1503 return EINVAL;
1504 rf_paritymap_set_disable(raidPtr, *(int *)data);
1505 /* XXX should errors be passed up? */
1506 return 0;
1507
1508 case RAIDFRAME_RESET_ACCTOTALS:
1509 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1510 return (0);
1511
1512 case RAIDFRAME_GET_ACCTOTALS:
1513 totals = (RF_AccTotals_t *) data;
1514 *totals = raidPtr->acc_totals;
1515 return (0);
1516
1517 case RAIDFRAME_KEEP_ACCTOTALS:
1518 raidPtr->keep_acc_totals = *(int *)data;
1519 return (0);
1520
1521 case RAIDFRAME_GET_SIZE:
1522 *(int *) data = raidPtr->totalSectors;
1523 return (0);
1524
1525 /* fail a disk & optionally start reconstruction */
1526 case RAIDFRAME_FAIL_DISK:
1527
1528 if (raidPtr->Layout.map->faultsTolerated == 0) {
1529 /* Can't do this on a RAID 0!! */
1530 return(EINVAL);
1531 }
1532
1533 rr = (struct rf_recon_req *) data;
1534 rr->row = 0;
1535 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1536 return (EINVAL);
1537
1538
1539 rf_lock_mutex2(raidPtr->mutex);
1540 if (raidPtr->status == rf_rs_reconstructing) {
1541 /* you can't fail a disk while we're reconstructing! */
1542 /* XXX wrong for RAID6 */
1543 rf_unlock_mutex2(raidPtr->mutex);
1544 return (EINVAL);
1545 }
1546 if ((raidPtr->Disks[rr->col].status ==
1547 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1548 /* some other component has failed. Let's not make
1549 things worse. XXX wrong for RAID6 */
1550 rf_unlock_mutex2(raidPtr->mutex);
1551 return (EINVAL);
1552 }
1553 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1554 /* Can't fail a spared disk! */
1555 rf_unlock_mutex2(raidPtr->mutex);
1556 return (EINVAL);
1557 }
1558 rf_unlock_mutex2(raidPtr->mutex);
1559
1560 /* make a copy of the recon request so that we don't rely on
1561 * the user's buffer */
1562 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1563 if (rrcopy == NULL)
1564 return(ENOMEM);
1565 memcpy(rrcopy, rr, sizeof(*rr));
1566 rrcopy->raidPtr = (void *) raidPtr;
1567
1568 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1569 rf_ReconThread,
1570 rrcopy,"raid_recon");
1571 return (0);
1572
1573 /* invoke a copyback operation after recon on whatever disk
1574 * needs it, if any */
1575 case RAIDFRAME_COPYBACK:
1576
1577 if (raidPtr->Layout.map->faultsTolerated == 0) {
1578 /* This makes no sense on a RAID 0!! */
1579 return(EINVAL);
1580 }
1581
1582 if (raidPtr->copyback_in_progress == 1) {
1583 /* Copyback is already in progress! */
1584 return(EINVAL);
1585 }
1586
1587 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1588 rf_CopybackThread,
1589 raidPtr,"raid_copyback");
1590 return (retcode);
1591
1592 /* return the percentage completion of reconstruction */
1593 case RAIDFRAME_CHECK_RECON_STATUS:
1594 if (raidPtr->Layout.map->faultsTolerated == 0) {
1595 /* This makes no sense on a RAID 0, so tell the
1596 user it's done. */
1597 *(int *) data = 100;
1598 return(0);
1599 }
1600 if (raidPtr->status != rf_rs_reconstructing)
1601 *(int *) data = 100;
1602 else {
1603 if (raidPtr->reconControl->numRUsTotal > 0) {
1604 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1605 } else {
1606 *(int *) data = 0;
1607 }
1608 }
1609 return (0);
1610 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1611 progressInfoPtr = (RF_ProgressInfo_t **) data;
1612 if (raidPtr->status != rf_rs_reconstructing) {
1613 progressInfo.remaining = 0;
1614 progressInfo.completed = 100;
1615 progressInfo.total = 100;
1616 } else {
1617 progressInfo.total =
1618 raidPtr->reconControl->numRUsTotal;
1619 progressInfo.completed =
1620 raidPtr->reconControl->numRUsComplete;
1621 progressInfo.remaining = progressInfo.total -
1622 progressInfo.completed;
1623 }
1624 retcode = copyout(&progressInfo, *progressInfoPtr,
1625 sizeof(RF_ProgressInfo_t));
1626 return (retcode);
1627
1628 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1629 if (raidPtr->Layout.map->faultsTolerated == 0) {
1630 /* This makes no sense on a RAID 0, so tell the
1631 user it's done. */
1632 *(int *) data = 100;
1633 return(0);
1634 }
1635 if (raidPtr->parity_rewrite_in_progress == 1) {
1636 *(int *) data = 100 *
1637 raidPtr->parity_rewrite_stripes_done /
1638 raidPtr->Layout.numStripe;
1639 } else {
1640 *(int *) data = 100;
1641 }
1642 return (0);
1643
1644 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1645 progressInfoPtr = (RF_ProgressInfo_t **) data;
1646 if (raidPtr->parity_rewrite_in_progress == 1) {
1647 progressInfo.total = raidPtr->Layout.numStripe;
1648 progressInfo.completed =
1649 raidPtr->parity_rewrite_stripes_done;
1650 progressInfo.remaining = progressInfo.total -
1651 progressInfo.completed;
1652 } else {
1653 progressInfo.remaining = 0;
1654 progressInfo.completed = 100;
1655 progressInfo.total = 100;
1656 }
1657 retcode = copyout(&progressInfo, *progressInfoPtr,
1658 sizeof(RF_ProgressInfo_t));
1659 return (retcode);
1660
1661 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1662 if (raidPtr->Layout.map->faultsTolerated == 0) {
1663 /* This makes no sense on a RAID 0 */
1664 *(int *) data = 100;
1665 return(0);
1666 }
1667 if (raidPtr->copyback_in_progress == 1) {
1668 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1669 raidPtr->Layout.numStripe;
1670 } else {
1671 *(int *) data = 100;
1672 }
1673 return (0);
1674
1675 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1676 progressInfoPtr = (RF_ProgressInfo_t **) data;
1677 if (raidPtr->copyback_in_progress == 1) {
1678 progressInfo.total = raidPtr->Layout.numStripe;
1679 progressInfo.completed =
1680 raidPtr->copyback_stripes_done;
1681 progressInfo.remaining = progressInfo.total -
1682 progressInfo.completed;
1683 } else {
1684 progressInfo.remaining = 0;
1685 progressInfo.completed = 100;
1686 progressInfo.total = 100;
1687 }
1688 retcode = copyout(&progressInfo, *progressInfoPtr,
1689 sizeof(RF_ProgressInfo_t));
1690 return (retcode);
1691
1692 /* the sparetable daemon calls this to wait for the kernel to
1693 * need a spare table. this ioctl does not return until a
1694 * spare table is needed. XXX -- calling mpsleep here in the
1695 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1696 * -- I should either compute the spare table in the kernel,
1697 * or have a different -- XXX XXX -- interface (a different
1698 * character device) for delivering the table -- XXX */
1699 #if 0
1700 case RAIDFRAME_SPARET_WAIT:
1701 rf_lock_mutex2(rf_sparet_wait_mutex);
1702 while (!rf_sparet_wait_queue)
1703 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1704 waitreq = rf_sparet_wait_queue;
1705 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1706 rf_unlock_mutex2(rf_sparet_wait_mutex);
1707
1708 /* structure assignment */
1709 *((RF_SparetWait_t *) data) = *waitreq;
1710
1711 RF_Free(waitreq, sizeof(*waitreq));
1712 return (0);
1713
1714 /* wakes up a process waiting on SPARET_WAIT and puts an error
1715 	 * code in it that will cause the daemon to exit */
1716 case RAIDFRAME_ABORT_SPARET_WAIT:
1717 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1718 waitreq->fcol = -1;
1719 rf_lock_mutex2(rf_sparet_wait_mutex);
1720 waitreq->next = rf_sparet_wait_queue;
1721 rf_sparet_wait_queue = waitreq;
1722 rf_broadcast_conf2(rf_sparet_wait_cv);
1723 rf_unlock_mutex2(rf_sparet_wait_mutex);
1724 return (0);
1725
1726 /* used by the spare table daemon to deliver a spare table
1727 * into the kernel */
1728 case RAIDFRAME_SEND_SPARET:
1729
1730 /* install the spare table */
1731 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1732
1733 /* respond to the requestor. the return status of the spare
1734 * table installation is passed in the "fcol" field */
1735 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1736 waitreq->fcol = retcode;
1737 rf_lock_mutex2(rf_sparet_wait_mutex);
1738 waitreq->next = rf_sparet_resp_queue;
1739 rf_sparet_resp_queue = waitreq;
1740 rf_broadcast_cond2(rf_sparet_resp_cv);
1741 rf_unlock_mutex2(rf_sparet_wait_mutex);
1742
1743 return (retcode);
1744 #endif
1745
1746 default:
1747 break; /* fall through to the os-specific code below */
1748
1749 }
1750
1751 if (!raidPtr->valid)
1752 return (EINVAL);
1753
1754 /*
1755 * Add support for "regular" device ioctls here.
1756 */
1757
1758 error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
1759 if (error != EPASSTHROUGH)
1760 return (error);
1761
1762 switch (cmd) {
1763 case DIOCGDINFO:
1764 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1765 break;
1766 #ifdef __HAVE_OLD_DISKLABEL
1767 case ODIOCGDINFO:
1768 newlabel = *(rs->sc_dkdev.dk_label);
1769 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1770 return ENOTTY;
1771 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1772 break;
1773 #endif
1774
1775 case DIOCGPART:
1776 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1777 ((struct partinfo *) data)->part =
1778 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1779 break;
1780
1781 case DIOCWDINFO:
1782 case DIOCSDINFO:
1783 #ifdef __HAVE_OLD_DISKLABEL
1784 case ODIOCWDINFO:
1785 case ODIOCSDINFO:
1786 #endif
1787 {
1788 struct disklabel *lp;
1789 #ifdef __HAVE_OLD_DISKLABEL
1790 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1791 memset(&newlabel, 0, sizeof newlabel);
1792 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1793 lp = &newlabel;
1794 } else
1795 #endif
1796 lp = (struct disklabel *)data;
1797
1798 if ((error = raidlock(rs)) != 0)
1799 return (error);
1800
1801 rs->sc_flags |= RAIDF_LABELLING;
1802
1803 error = setdisklabel(rs->sc_dkdev.dk_label,
1804 lp, 0, rs->sc_dkdev.dk_cpulabel);
1805 if (error == 0) {
1806 if (cmd == DIOCWDINFO
1807 #ifdef __HAVE_OLD_DISKLABEL
1808 || cmd == ODIOCWDINFO
1809 #endif
1810 )
1811 error = writedisklabel(RAIDLABELDEV(dev),
1812 raidstrategy, rs->sc_dkdev.dk_label,
1813 rs->sc_dkdev.dk_cpulabel);
1814 }
1815 rs->sc_flags &= ~RAIDF_LABELLING;
1816
1817 raidunlock(rs);
1818
1819 if (error)
1820 return (error);
1821 break;
1822 }
1823
1824 case DIOCWLABEL:
1825 if (*(int *) data != 0)
1826 rs->sc_flags |= RAIDF_WLABEL;
1827 else
1828 rs->sc_flags &= ~RAIDF_WLABEL;
1829 break;
1830
1831 case DIOCGDEFLABEL:
1832 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1833 break;
1834
1835 #ifdef __HAVE_OLD_DISKLABEL
1836 case ODIOCGDEFLABEL:
1837 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1838 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1839 return ENOTTY;
1840 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1841 break;
1842 #endif
1843
1844 case DIOCAWEDGE:
1845 case DIOCDWEDGE:
1846 dkw = (void *)data;
1847
1848 /* If the ioctl happens here, the parent is us. */
1849 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1850 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1851
1852 case DIOCLWEDGES:
1853 return dkwedge_list(&rs->sc_dkdev,
1854 (struct dkwedge_list *)data, l);
1855 case DIOCCACHESYNC:
1856 return rf_sync_component_caches(raidPtr);
1857 default:
1858 retcode = ENOTTY;
1859 }
1860 return (retcode);
1861
1862 }
1863
1864
1865 /* raidinit -- complete the rest of the initialization for the
1866 RAIDframe device. */
1867
1868
1869 static void
1870 raidinit(RF_Raid_t *raidPtr)
1871 {
1872 cfdata_t cf;
1873 struct raid_softc *rs;
1874 int unit;
1875
1876 unit = raidPtr->raidid;
1877
1878 rs = &raid_softc[unit];
1879
1880 /* XXX should check return code first... */
1881 rs->sc_flags |= RAIDF_INITED;
1882
1883 /* XXX doesn't check bounds. */
1884 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1885
1886 /* attach the pseudo device */
1887 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1888 cf->cf_name = raid_cd.cd_name;
1889 cf->cf_atname = raid_cd.cd_name;
1890 cf->cf_unit = unit;
1891 cf->cf_fstate = FSTATE_STAR;
1892
1893 rs->sc_dev = config_attach_pseudo(cf);
1894
1895 if (rs->sc_dev == NULL) {
1896 printf("raid%d: config_attach_pseudo failed\n",
1897 raidPtr->raidid);
1898 rs->sc_flags &= ~RAIDF_INITED;
1899 free(cf, M_RAIDFRAME);
1900 return;
1901 }
1902
1903 /* disk_attach actually creates space for the CPU disklabel, among
1904 * other things, so it's critical to call this *BEFORE* we try putzing
1905 * with disklabels. */
1906
1907 disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1908 disk_attach(&rs->sc_dkdev);
1909 disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
1910
1911 /* XXX There may be a weird interaction here between this, and
1912 * protectedSectors, as used in RAIDframe. */
1913
1914 rs->sc_size = raidPtr->totalSectors;
1915
1916 dkwedge_discover(&rs->sc_dkdev);
1917
1918 rf_set_properties(rs, raidPtr);
1919
1920 }
1921 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1922 /* wake up the daemon & tell it to get us a spare table
1923 * XXX
1924 * the entries in the queues should be tagged with the raidPtr
1925 * so that in the extremely rare case that two recons happen at once,
1926  * we know for which device we're requesting a spare table
1927 * XXX
1928 *
1929 * XXX This code is not currently used. GO
1930 */
/*
 * rf_GetSpareTableFromDaemon: hand a spare-table request to the
 * user-level sparetable daemon and sleep until it responds.
 *
 * The request is pushed onto rf_sparet_wait_queue (where the daemon's
 * RAIDFRAME_SPARET_WAIT ioctl picks it up), then we wait for an entry
 * to show up on rf_sparet_resp_queue.
 *
 * Returns the "fcol" status code carried in the response.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	/* Queue the request and wake any daemon blocked in SPARET_WAIT. */
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2() drops the mutex while sleeping */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the response off the queue (mutex still held). */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1954 #endif
1955
1956 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1957 * bp & passes it down.
1958 * any calls originating in the kernel must use non-blocking I/O
1959 * do some extra sanity checking to return "appropriate" error values for
1960 * certain conditions (to make some standard utilities work)
1961 *
1962 * Formerly known as: rf_DoAccessKernel
1963 */
/*
 * raidstart: drain buffers from the unit's queue and hand each one to
 * RAIDframe via rf_DoAccess(), until the queue is empty or the unit
 * runs out of openings.  All I/O issued here is non-blocking; failures
 * detected locally are completed immediately with biodone().
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	int unit;
	struct raid_softc *rs;
	int do_async;
	struct buf *bp;
	int rc;

	unit = raidPtr->raidid;
	rs = &raid_softc[unit];

	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Component labels are rewritten without the mutex held. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
		    RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/*
	 * Check to see if we're at the limit...
	 * Invariant: raidPtr->mutex is held at the top of every iteration
	 * and released before touching the buffer queue or issuing I/O.
	 */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			/* Add the partition offset from the disklabel. */
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb is 1 if the request has a partial trailing sector. */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject requests past the end of the set; the (sum < x)
		 * comparisons also catch wraparound in the sum above. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* Retake the mutex for the next loop iteration. */
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* The byte count must be a whole number of sectors. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* Consume one opening for this I/O. */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* rf_DoAccess() refused the request outright, so no
			 * completion callback will run; fail the buffer now. */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
2084
2085
2086
2087
2088 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2089
/*
 * rf_DispatchKernelIO: issue one low-level request (read, write, or
 * NOP) described by req on behalf of the given disk queue.  The disk
 * queue mutex is held on entry; it is dropped around bdev_strategy()
 * (which can block) and retaken before returning.
 *
 * Always returns 0; an unknown request type panics.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately via the normal I/O callback. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Aim bp at the component's vnode/device for the requested
		 * sector range; KernelWakeupFunc fires on completion. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		       op, queue->rf_cinfo->ci_dev,
		       req->sectorOffset, req->numSector,
		       req->buf, KernelWakeupFunc, (void *) req,
		       queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				    (long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			    (int) req->sectorOffset, (int) req->numSector,
			    (int) (req->numSector <<
				   queue->raidPtr->logBytesPerSector),
			    (int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with a I/O invoked from
   kernel code.

   Runs at biodone time: records tracing info, marks the component
   failed on I/O error (if the set can survive it), and hands the
   request to the raidio thread via the iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The originating request was stashed in b_private by
	 * rf_DispatchKernelIO()/InitBP(). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/* All completion bookkeeping happens under iodone_lock. */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2231
2232
2233 /*
2234 * initialize a buf structure for doing an I/O in the kernel.
2235 */
2236 static void
2237 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2238 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2239 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2240 struct proc *b_proc)
2241 {
2242 /* bp->b_flags = B_PHYS | rw_flag; */
2243 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2244 bp->b_oflags = 0;
2245 bp->b_cflags = 0;
2246 bp->b_bcount = numSect << logBytesPerSector;
2247 bp->b_bufsize = bp->b_bcount;
2248 bp->b_error = 0;
2249 bp->b_dev = dev;
2250 bp->b_data = bf;
2251 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2252 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2253 if (bp->b_bcount == 0) {
2254 panic("bp->b_bcount is zero in InitBP!!");
2255 }
2256 bp->b_proc = b_proc;
2257 bp->b_iodone = cbFunc;
2258 bp->b_private = cbArg;
2259 }
2260
2261 static void
2262 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2263 struct disklabel *lp)
2264 {
2265 memset(lp, 0, sizeof(*lp));
2266
2267 /* fabricate a label... */
2268 lp->d_secperunit = raidPtr->totalSectors;
2269 lp->d_secsize = raidPtr->bytesPerSector;
2270 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2271 lp->d_ntracks = 4 * raidPtr->numCol;
2272 lp->d_ncylinders = raidPtr->totalSectors /
2273 (lp->d_nsectors * lp->d_ntracks);
2274 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2275
2276 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2277 lp->d_type = DTYPE_RAID;
2278 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2279 lp->d_rpm = 3600;
2280 lp->d_interleave = 1;
2281 lp->d_flags = 0;
2282
2283 lp->d_partitions[RAW_PART].p_offset = 0;
2284 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2285 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2286 lp->d_npartitions = RAW_PART + 1;
2287
2288 lp->d_magic = DISKMAGIC;
2289 lp->d_magic2 = DISKMAGIC;
2290 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2291
2292 }
2293 /*
2294 * Read the disklabel from the raid device. If one is not present, fake one
2295 * up.
2296 */
2297 static void
2298 raidgetdisklabel(dev_t dev)
2299 {
2300 int unit = raidunit(dev);
2301 struct raid_softc *rs = &raid_softc[unit];
2302 const char *errstring;
2303 struct disklabel *lp = rs->sc_dkdev.dk_label;
2304 struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
2305 RF_Raid_t *raidPtr;
2306
2307 db1_printf(("Getting the disklabel...\n"));
2308
2309 memset(clp, 0, sizeof(*clp));
2310
2311 raidPtr = raidPtrs[unit];
2312
2313 raidgetdefaultlabel(raidPtr, rs, lp);
2314
2315 /*
2316 * Call the generic disklabel extraction routine.
2317 */
2318 errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
2319 rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
2320 if (errstring)
2321 raidmakedisklabel(rs);
2322 else {
2323 int i;
2324 struct partition *pp;
2325
2326 /*
2327 * Sanity check whether the found disklabel is valid.
2328 *
2329 * This is necessary since total size of the raid device
2330 * may vary when an interleave is changed even though exactly
2331 * same components are used, and old disklabel may used
2332 * if that is found.
2333 */
2334 if (lp->d_secperunit != rs->sc_size)
2335 printf("raid%d: WARNING: %s: "
2336 "total sector size in disklabel (%" PRIu32 ") != "
2337 "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
2338 lp->d_secperunit, rs->sc_size);
2339 for (i = 0; i < lp->d_npartitions; i++) {
2340 pp = &lp->d_partitions[i];
2341 if (pp->p_offset + pp->p_size > rs->sc_size)
2342 printf("raid%d: WARNING: %s: end of partition `%c' "
2343 "exceeds the size of raid (%" PRIu64 ")\n",
2344 unit, rs->sc_xname, 'a' + i, rs->sc_size);
2345 }
2346 }
2347
2348 }
2349 /*
2350 * Take care of things one might want to take care of in the event
2351 * that a disklabel isn't present.
2352 */
2353 static void
2354 raidmakedisklabel(struct raid_softc *rs)
2355 {
2356 struct disklabel *lp = rs->sc_dkdev.dk_label;
2357 db1_printf(("Making a label..\n"));
2358
2359 /*
2360 * For historical reasons, if there's no disklabel present
2361 * the raw partition must be marked FS_BSDFFS.
2362 */
2363
2364 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2365
2366 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2367
2368 lp->d_checksum = dkcksum(lp);
2369 }
2370 /*
2371 * Wait interruptibly for an exclusive lock.
2372 *
2373 * XXX
2374 * Several drivers do this; it should be abstracted and made MP-safe.
2375 * (Hmm... where have we seen this warning before :-> GO )
2376 */
2377 static int
2378 raidlock(struct raid_softc *rs)
2379 {
2380 int error;
2381
2382 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2383 rs->sc_flags |= RAIDF_WANTED;
2384 if ((error =
2385 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2386 return (error);
2387 }
2388 rs->sc_flags |= RAIDF_LOCKED;
2389 return (0);
2390 }
2391 /*
2392 * Unlock and wake up any waiters.
2393 */
2394 static void
2395 raidunlock(struct raid_softc *rs)
2396 {
2397
2398 rs->sc_flags &= ~RAIDF_LOCKED;
2399 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2400 rs->sc_flags &= ~RAIDF_WANTED;
2401 wakeup(rs);
2402 }
2403 }
2404
2405
2406 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2407 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2408 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2409
2410 static daddr_t
2411 rf_component_info_offset(void)
2412 {
2413
2414 return RF_COMPONENT_INFO_OFFSET;
2415 }
2416
2417 static daddr_t
2418 rf_component_info_size(unsigned secsize)
2419 {
2420 daddr_t info_size;
2421
2422 KASSERT(secsize);
2423 if (secsize > RF_COMPONENT_INFO_SIZE)
2424 info_size = secsize;
2425 else
2426 info_size = RF_COMPONENT_INFO_SIZE;
2427
2428 return info_size;
2429 }
2430
2431 static daddr_t
2432 rf_parity_map_offset(RF_Raid_t *raidPtr)
2433 {
2434 daddr_t map_offset;
2435
2436 KASSERT(raidPtr->bytesPerSector);
2437 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2438 map_offset = raidPtr->bytesPerSector;
2439 else
2440 map_offset = RF_COMPONENT_INFO_SIZE;
2441 map_offset += rf_component_info_offset();
2442
2443 return map_offset;
2444 }
2445
2446 static daddr_t
2447 rf_parity_map_size(RF_Raid_t *raidPtr)
2448 {
2449 daddr_t map_size;
2450
2451 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2452 map_size = raidPtr->bytesPerSector;
2453 else
2454 map_size = RF_PARITY_MAP_SIZE;
2455
2456 return map_size;
2457 }
2458
2459 int
2460 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2461 {
2462 RF_ComponentLabel_t *clabel;
2463
2464 clabel = raidget_component_label(raidPtr, col);
2465 clabel->clean = RF_RAID_CLEAN;
2466 raidflush_component_label(raidPtr, col);
2467 return(0);
2468 }
2469
2470
2471 int
2472 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2473 {
2474 RF_ComponentLabel_t *clabel;
2475
2476 clabel = raidget_component_label(raidPtr, col);
2477 clabel->clean = RF_RAID_DIRTY;
2478 raidflush_component_label(raidPtr, col);
2479 return(0);
2480 }
2481
2482 int
2483 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2484 {
2485 KASSERT(raidPtr->bytesPerSector);
2486 return raidread_component_label(raidPtr->bytesPerSector,
2487 raidPtr->Disks[col].dev,
2488 raidPtr->raid_cinfo[col].ci_vp,
2489 &raidPtr->raid_cinfo[col].ci_label);
2490 }
2491
2492 RF_ComponentLabel_t *
2493 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2494 {
2495 return &raidPtr->raid_cinfo[col].ci_label;
2496 }
2497
2498 int
2499 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2500 {
2501 RF_ComponentLabel_t *label;
2502
2503 label = &raidPtr->raid_cinfo[col].ci_label;
2504 label->mod_counter = raidPtr->mod_counter;
2505 #ifndef RF_NO_PARITY_MAP
2506 label->parity_map_modcount = label->mod_counter;
2507 #endif
2508 return raidwrite_component_label(raidPtr->bytesPerSector,
2509 raidPtr->Disks[col].dev,
2510 raidPtr->raid_cinfo[col].ci_vp, label);
2511 }
2512
2513
2514 static int
2515 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2516 RF_ComponentLabel_t *clabel)
2517 {
2518 return raidread_component_area(dev, b_vp, clabel,
2519 sizeof(RF_ComponentLabel_t),
2520 rf_component_info_offset(),
2521 rf_component_info_size(secsize));
2522 }
2523
2524 /* ARGSUSED */
2525 static int
2526 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2527 size_t msize, daddr_t offset, daddr_t dsize)
2528 {
2529 struct buf *bp;
2530 const struct bdevsw *bdev;
2531 int error;
2532
2533 /* XXX should probably ensure that we don't try to do this if
2534 someone has changed rf_protected_sectors. */
2535
2536 if (b_vp == NULL) {
2537 /* For whatever reason, this component is not valid.
2538 Don't try to read a component label from it. */
2539 return(EINVAL);
2540 }
2541
2542 /* get a block of the appropriate size... */
2543 bp = geteblk((int)dsize);
2544 bp->b_dev = dev;
2545
2546 /* get our ducks in a row for the read */
2547 bp->b_blkno = offset / DEV_BSIZE;
2548 bp->b_bcount = dsize;
2549 bp->b_flags |= B_READ;
2550 bp->b_resid = dsize;
2551
2552 bdev = bdevsw_lookup(bp->b_dev);
2553 if (bdev == NULL)
2554 return (ENXIO);
2555 (*bdev->d_strategy)(bp);
2556
2557 error = biowait(bp);
2558
2559 if (!error) {
2560 memcpy(data, bp->b_data, msize);
2561 }
2562
2563 brelse(bp, 0);
2564 return(error);
2565 }
2566
2567
2568 static int
2569 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2570 RF_ComponentLabel_t *clabel)
2571 {
2572 return raidwrite_component_area(dev, b_vp, clabel,
2573 sizeof(RF_ComponentLabel_t),
2574 rf_component_info_offset(),
2575 rf_component_info_size(secsize), 0);
2576 }
2577
2578 /* ARGSUSED */
2579 static int
2580 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2581 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2582 {
2583 struct buf *bp;
2584 const struct bdevsw *bdev;
2585 int error;
2586
2587 /* get a block of the appropriate size... */
2588 bp = geteblk((int)dsize);
2589 bp->b_dev = dev;
2590
2591 /* get our ducks in a row for the write */
2592 bp->b_blkno = offset / DEV_BSIZE;
2593 bp->b_bcount = dsize;
2594 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2595 bp->b_resid = dsize;
2596
2597 memset(bp->b_data, 0, dsize);
2598 memcpy(bp->b_data, data, msize);
2599
2600 bdev = bdevsw_lookup(bp->b_dev);
2601 if (bdev == NULL)
2602 return (ENXIO);
2603 (*bdev->d_strategy)(bp);
2604 if (asyncp)
2605 return 0;
2606 error = biowait(bp);
2607 brelse(bp, 0);
2608 if (error) {
2609 #if 1
2610 printf("Failed to write RAID component info!\n");
2611 #endif
2612 }
2613
2614 return(error);
2615 }
2616
2617 void
2618 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2619 {
2620 int c;
2621
2622 for (c = 0; c < raidPtr->numCol; c++) {
2623 /* Skip dead disks. */
2624 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2625 continue;
2626 /* XXXjld: what if an error occurs here? */
2627 raidwrite_component_area(raidPtr->Disks[c].dev,
2628 raidPtr->raid_cinfo[c].ci_vp, map,
2629 RF_PARITYMAP_NBYTE,
2630 rf_parity_map_offset(raidPtr),
2631 rf_parity_map_size(raidPtr), 0);
2632 }
2633 }
2634
2635 void
2636 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2637 {
2638 struct rf_paritymap_ondisk tmp;
2639 int c,first;
2640
2641 first=1;
2642 for (c = 0; c < raidPtr->numCol; c++) {
2643 /* Skip dead disks. */
2644 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2645 continue;
2646 raidread_component_area(raidPtr->Disks[c].dev,
2647 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2648 RF_PARITYMAP_NBYTE,
2649 rf_parity_map_offset(raidPtr),
2650 rf_parity_map_size(raidPtr));
2651 if (first) {
2652 memcpy(map, &tmp, sizeof(*map));
2653 first = 0;
2654 } else {
2655 rf_paritymap_merge(map, &tmp);
2656 }
2657 }
2658 }
2659
/*
 * Bump the set's modification counter and mark every live component
 * (and every in-use spare) dirty on disk.  Called e.g. when the set is
 * opened for writing.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for;
			   scol keeps its previous value (-1 initially) if
			   no match is found */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2719
2720
/*
 * Push fresh component labels to all optimal components and in-use
 * spares.  When 'final' is RF_FINAL_COMPONENT_UPDATE and parity is
 * known good, also mark the components clean (e.g. at shutdown).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare replaced; scol keeps
			   its previous value if no match is found */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2795
2796 void
2797 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2798 {
2799
2800 if (vp != NULL) {
2801 if (auto_configured == 1) {
2802 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2803 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2804 vput(vp);
2805
2806 } else {
2807 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2808 }
2809 }
2810 }
2811
2812
2813 void
2814 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2815 {
2816 int r,c;
2817 struct vnode *vp;
2818 int acd;
2819
2820
2821 /* We take this opportunity to close the vnodes like we should.. */
2822
2823 for (c = 0; c < raidPtr->numCol; c++) {
2824 vp = raidPtr->raid_cinfo[c].ci_vp;
2825 acd = raidPtr->Disks[c].auto_configured;
2826 rf_close_component(raidPtr, vp, acd);
2827 raidPtr->raid_cinfo[c].ci_vp = NULL;
2828 raidPtr->Disks[c].auto_configured = 0;
2829 }
2830
2831 for (r = 0; r < raidPtr->numSpare; r++) {
2832 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2833 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2834 rf_close_component(raidPtr, vp, acd);
2835 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2836 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2837 }
2838 }
2839
2840
/*
 * Kernel thread body: fail the requested component and (optionally)
 * reconstruct onto a spare.  Frees 'req' and exits; never returns.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* RF_FDFLAGS_RECON requests reconstruction after the failure */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2862
/*
 * Kernel thread body: rewrite all parity on the set.  On success the
 * set is marked parity-clean; wakes any shutdown waiter and exits.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2893
2894
2895 void
2896 rf_CopybackThread(RF_Raid_t *raidPtr)
2897 {
2898 int s;
2899
2900 raidPtr->copyback_in_progress = 1;
2901 s = splbio();
2902 rf_CopybackReconstructedData(raidPtr);
2903 splx(s);
2904 raidPtr->copyback_in_progress = 0;
2905
2906 /* That's all... */
2907 kthread_exit(0); /* does not return */
2908 }
2909
2910
/*
 * Kernel thread body: reconstruct a component in place (onto itself,
 * e.g. after a disk replacement in the same slot).  Frees 'req' and
 * exits; never returns.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2928
/*
 * Probe one candidate component: read its label and, if it looks
 * reasonable, prepend an RF_AutoConfig_t for it to 'ac_list'.
 *
 * On rejection the vnode is closed and released; on out-of-memory the
 * entire ac_list is freed and NULL is returned.  Returns the (possibly
 * extended) list otherwise.  Ownership of 'vp' transfers here in all
 * cases: accepted components keep it in the list entry.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* out of memory: tear down everything collected so far */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			/* prepend to the list */
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: rejected component keeps nothing open */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2986
/*
 * Scan every disk-class device in the system for RAIDframe components.
 *
 * Wedges (dk devices) qualify when their partition type is
 * DKW_PTYPE_RAIDFRAME; ordinary disks are examined partition by
 * partition for FS_RAID entries.  Each candidate is handed to
 * rf_get_component(), which validates its label.  Returns the list of
 * plausible components (possibly NULL).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* wedges carry their type in the wedge info, not
			   in a disklabel */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component takes ownership of vp */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		/* probe each FS_RAID partition on this disk */
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
3137
3138
3139 int
3140 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3141 {
3142
3143 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3144 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3145 ((clabel->clean == RF_RAID_CLEAN) ||
3146 (clabel->clean == RF_RAID_DIRTY)) &&
3147 clabel->row >=0 &&
3148 clabel->column >= 0 &&
3149 clabel->num_rows > 0 &&
3150 clabel->num_columns > 0 &&
3151 clabel->row < clabel->num_rows &&
3152 clabel->column < clabel->num_columns &&
3153 clabel->blockSize > 0 &&
3154 /*
3155 * numBlocksHi may contain garbage, but it is ok since
3156 * the type is unsigned. If it is really garbage,
3157 * rf_fix_old_label_size() will fix it.
3158 */
3159 rf_component_label_numblocks(clabel) > 0) {
3160 /*
3161 * label looks reasonable enough...
3162 * let's make sure it has no old garbage.
3163 */
3164 if (numsecs)
3165 rf_fix_old_label_size(clabel, numsecs);
3166 return(1);
3167 }
3168 return(0);
3169 }
3170
3171
3172 /*
3173 * For reasons yet unknown, some old component labels have garbage in
3174 * the newer numBlocksHi region, and this causes lossage. Since those
3175 * disks will also have numsecs set to less than 32 bits of sectors,
3176 * we can determine when this corruption has occured, and fix it.
3177 *
3178 * The exact same problem, with the same unknown reason, happens to
3179 * the partitionSizeHi member as well.
3180 */
3181 static void
3182 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3183 {
3184
3185 if (numsecs < ((uint64_t)1 << 32)) {
3186 if (clabel->numBlocksHi) {
3187 printf("WARNING: total sectors < 32 bits, yet "
3188 "numBlocksHi set\n"
3189 "WARNING: resetting numBlocksHi to zero.\n");
3190 clabel->numBlocksHi = 0;
3191 }
3192
3193 if (clabel->partitionSizeHi) {
3194 printf("WARNING: total sectors < 32 bits, yet "
3195 "partitionSizeHi set\n"
3196 "WARNING: resetting partitionSizeHi to zero.\n");
3197 clabel->partitionSizeHi = 0;
3198 }
3199 }
3200 }
3201
3202
#ifdef DEBUG
/* Dump a component label to the console (DEBUG kernels only). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;

	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3233
3234 RF_ConfigSet_t *
3235 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3236 {
3237 RF_AutoConfig_t *ac;
3238 RF_ConfigSet_t *config_sets;
3239 RF_ConfigSet_t *cset;
3240 RF_AutoConfig_t *ac_next;
3241
3242
3243 config_sets = NULL;
3244
3245 /* Go through the AutoConfig list, and figure out which components
3246 belong to what sets. */
3247 ac = ac_list;
3248 while(ac!=NULL) {
3249 /* we're going to putz with ac->next, so save it here
3250 for use at the end of the loop */
3251 ac_next = ac->next;
3252
3253 if (config_sets == NULL) {
3254 /* will need at least this one... */
3255 config_sets = (RF_ConfigSet_t *)
3256 malloc(sizeof(RF_ConfigSet_t),
3257 M_RAIDFRAME, M_NOWAIT);
3258 if (config_sets == NULL) {
3259 panic("rf_create_auto_sets: No memory!");
3260 }
3261 /* this one is easy :) */
3262 config_sets->ac = ac;
3263 config_sets->next = NULL;
3264 config_sets->rootable = 0;
3265 ac->next = NULL;
3266 } else {
3267 /* which set does this component fit into? */
3268 cset = config_sets;
3269 while(cset!=NULL) {
3270 if (rf_does_it_fit(cset, ac)) {
3271 /* looks like it matches... */
3272 ac->next = cset->ac;
3273 cset->ac = ac;
3274 break;
3275 }
3276 cset = cset->next;
3277 }
3278 if (cset==NULL) {
3279 /* didn't find a match above... new set..*/
3280 cset = (RF_ConfigSet_t *)
3281 malloc(sizeof(RF_ConfigSet_t),
3282 M_RAIDFRAME, M_NOWAIT);
3283 if (cset == NULL) {
3284 panic("rf_create_auto_sets: No memory!");
3285 }
3286 cset->ac = ac;
3287 ac->next = NULL;
3288 cset->next = config_sets;
3289 cset->rootable = 0;
3290 config_sets = cset;
3291 }
3292 }
3293 ac = ac_next;
3294 }
3295
3296
3297 return(config_sets);
3298 }
3299
3300 static int
3301 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3302 {
3303 RF_ComponentLabel_t *clabel1, *clabel2;
3304
3305 /* If this one matches the *first* one in the set, that's good
3306 enough, since the other members of the set would have been
3307 through here too... */
3308 /* note that we are not checking partitionSize here..
3309
3310 Note that we are also not checking the mod_counters here.
3311 If everything else matches execpt the mod_counter, that's
3312 good enough for this test. We will deal with the mod_counters
3313 a little later in the autoconfiguration process.
3314
3315 (clabel1->mod_counter == clabel2->mod_counter) &&
3316
3317 The reason we don't check for this is that failed disks
3318 will have lower modification counts. If those disks are
3319 not added to the set they used to belong to, then they will
3320 form their own set, which may result in 2 different sets,
3321 for example, competing to be configured at raid0, and
3322 perhaps competing to be the root filesystem set. If the
3323 wrong ones get configured, or both attempt to become /,
3324 weird behaviour and or serious lossage will occur. Thus we
3325 need to bring them into the fold here, and kick them out at
3326 a later point.
3327
3328 */
3329
3330 clabel1 = cset->ac->clabel;
3331 clabel2 = ac->clabel;
3332 if ((clabel1->version == clabel2->version) &&
3333 (clabel1->serial_number == clabel2->serial_number) &&
3334 (clabel1->num_rows == clabel2->num_rows) &&
3335 (clabel1->num_columns == clabel2->num_columns) &&
3336 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3337 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3338 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3339 (clabel1->parityConfig == clabel2->parityConfig) &&
3340 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3341 (clabel1->blockSize == clabel2->blockSize) &&
3342 rf_component_label_numblocks(clabel1) ==
3343 rf_component_label_numblocks(clabel2) &&
3344 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3345 (clabel1->root_partition == clabel2->root_partition) &&
3346 (clabel1->last_unit == clabel2->last_unit) &&
3347 (clabel1->config_order == clabel2->config_order)) {
3348 /* if it get's here, it almost *has* to be a match */
3349 } else {
3350 /* it's not consistent with somebody in the set..
3351 punt */
3352 return(0);
3353 }
3354 /* all was fine.. it must fit... */
3355 return(1);
3356 }
3357
/*
 * Decide whether the config set `cset' has enough live components to be
 * configured.  Returns 1 if the set is viable, 0 if too many components
 * are missing for the given parity type.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;		/* columns the set is supposed to have */
	int num_missing;	/* columns with no current component */
	int mod_counter;	/* highest mod_counter seen in the set */
	int mod_counter_found;
	int even_pair_failed;	/* RAID 1: even half of current pair missing */
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set. If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set.
	   The correct value is the maximum over all members: a component
	   with a lower count is stale (e.g. a previously failed disk). */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (matching mod_counter) component
		   for column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component. If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just did the odd component of a mirror pair,
			   and we didn't bail.. reset the
			   even_pair_failed flag, and go on to the next
			   component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no failures; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3460
3461 void
3462 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3463 RF_Raid_t *raidPtr)
3464 {
3465 RF_ComponentLabel_t *clabel;
3466 int i;
3467
3468 clabel = ac->clabel;
3469
3470 /* 1. Fill in the common stuff */
3471 config->numRow = clabel->num_rows = 1;
3472 config->numCol = clabel->num_columns;
3473 config->numSpare = 0; /* XXX should this be set here? */
3474 config->sectPerSU = clabel->sectPerSU;
3475 config->SUsPerPU = clabel->SUsPerPU;
3476 config->SUsPerRU = clabel->SUsPerRU;
3477 config->parityConfig = clabel->parityConfig;
3478 /* XXX... */
3479 strcpy(config->diskQueueType,"fifo");
3480 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3481 config->layoutSpecificSize = 0; /* XXX ?? */
3482
3483 while(ac!=NULL) {
3484 /* row/col values will be in range due to the checks
3485 in reasonable_label() */
3486 strcpy(config->devnames[0][ac->clabel->column],
3487 ac->devname);
3488 ac = ac->next;
3489 }
3490
3491 for(i=0;i<RF_MAXDBGV;i++) {
3492 config->debugVars[i][0] = 0;
3493 }
3494 }
3495
3496 int
3497 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3498 {
3499 RF_ComponentLabel_t *clabel;
3500 int column;
3501 int sparecol;
3502
3503 raidPtr->autoconfigure = new_value;
3504
3505 for(column=0; column<raidPtr->numCol; column++) {
3506 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3507 clabel = raidget_component_label(raidPtr, column);
3508 clabel->autoconfigure = new_value;
3509 raidflush_component_label(raidPtr, column);
3510 }
3511 }
3512 for(column = 0; column < raidPtr->numSpare ; column++) {
3513 sparecol = raidPtr->numCol + column;
3514 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3515 clabel = raidget_component_label(raidPtr, sparecol);
3516 clabel->autoconfigure = new_value;
3517 raidflush_component_label(raidPtr, sparecol);
3518 }
3519 }
3520 return(new_value);
3521 }
3522
3523 int
3524 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3525 {
3526 RF_ComponentLabel_t *clabel;
3527 int column;
3528 int sparecol;
3529
3530 raidPtr->root_partition = new_value;
3531 for(column=0; column<raidPtr->numCol; column++) {
3532 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3533 clabel = raidget_component_label(raidPtr, column);
3534 clabel->root_partition = new_value;
3535 raidflush_component_label(raidPtr, column);
3536 }
3537 }
3538 for(column = 0; column < raidPtr->numSpare ; column++) {
3539 sparecol = raidPtr->numCol + column;
3540 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3541 clabel = raidget_component_label(raidPtr, sparecol);
3542 clabel->root_partition = new_value;
3543 raidflush_component_label(raidPtr, sparecol);
3544 }
3545 }
3546 return(new_value);
3547 }
3548
3549 void
3550 rf_release_all_vps(RF_ConfigSet_t *cset)
3551 {
3552 RF_AutoConfig_t *ac;
3553
3554 ac = cset->ac;
3555 while(ac!=NULL) {
3556 /* Close the vp, and give it back */
3557 if (ac->vp) {
3558 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3559 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3560 vput(ac->vp);
3561 ac->vp = NULL;
3562 }
3563 ac = ac->next;
3564 }
3565 }
3566
3567
3568 void
3569 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3570 {
3571 RF_AutoConfig_t *ac;
3572 RF_AutoConfig_t *next_ac;
3573
3574 ac = cset->ac;
3575 while(ac!=NULL) {
3576 next_ac = ac->next;
3577 /* nuke the label */
3578 free(ac->clabel, M_RAIDFRAME);
3579 /* cleanup the config structure */
3580 free(ac, M_RAIDFRAME);
3581 /* "next.." */
3582 ac = next_ac;
3583 }
3584 /* and, finally, nuke the config set */
3585 free(cset, M_RAIDFRAME);
3586 }
3587
3588
3589 void
3590 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3591 {
3592 /* current version number */
3593 clabel->version = RF_COMPONENT_LABEL_VERSION;
3594 clabel->serial_number = raidPtr->serial_number;
3595 clabel->mod_counter = raidPtr->mod_counter;
3596
3597 clabel->num_rows = 1;
3598 clabel->num_columns = raidPtr->numCol;
3599 clabel->clean = RF_RAID_DIRTY; /* not clean */
3600 clabel->status = rf_ds_optimal; /* "It's good!" */
3601
3602 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3603 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3604 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3605
3606 clabel->blockSize = raidPtr->bytesPerSector;
3607 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3608
3609 /* XXX not portable */
3610 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3611 clabel->maxOutstanding = raidPtr->maxOutstanding;
3612 clabel->autoconfigure = raidPtr->autoconfigure;
3613 clabel->root_partition = raidPtr->root_partition;
3614 clabel->last_unit = raidPtr->raidid;
3615 clabel->config_order = raidPtr->config_order;
3616
3617 #ifndef RF_NO_PARITY_MAP
3618 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3619 #endif
3620 }
3621
3622 int
3623 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
3624 {
3625 RF_Raid_t *raidPtr;
3626 RF_Config_t *config;
3627 int raidID;
3628 int retcode;
3629
3630 #ifdef DEBUG
3631 printf("RAID autoconfigure\n");
3632 #endif
3633
3634 retcode = 0;
3635 *unit = -1;
3636
3637 /* 1. Create a config structure */
3638
3639 config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
3640 M_RAIDFRAME,
3641 M_NOWAIT);
3642 if (config==NULL) {
3643 printf("Out of mem!?!?\n");
3644 /* XXX do something more intelligent here. */
3645 return(1);
3646 }
3647
3648 memset(config, 0, sizeof(RF_Config_t));
3649
3650 /*
3651 2. Figure out what RAID ID this one is supposed to live at
3652 See if we can get the same RAID dev that it was configured
3653 on last time..
3654 */
3655
3656 raidID = cset->ac->clabel->last_unit;
3657 if ((raidID < 0) || (raidID >= numraid)) {
3658 /* let's not wander off into lala land. */
3659 raidID = numraid - 1;
3660 }
3661 if (raidPtrs[raidID]->valid != 0) {
3662
3663 /*
3664 Nope... Go looking for an alternative...
3665 Start high so we don't immediately use raid0 if that's
3666 not taken.
3667 */
3668
3669 for(raidID = numraid - 1; raidID >= 0; raidID--) {
3670 if (raidPtrs[raidID]->valid == 0) {
3671 /* can use this one! */
3672 break;
3673 }
3674 }
3675 }
3676
3677 if (raidID < 0) {
3678 /* punt... */
3679 printf("Unable to auto configure this set!\n");
3680 printf("(Out of RAID devs!)\n");
3681 free(config, M_RAIDFRAME);
3682 return(1);
3683 }
3684
3685 #ifdef DEBUG
3686 printf("Configuring raid%d:\n",raidID);
3687 #endif
3688
3689 raidPtr = raidPtrs[raidID];
3690
3691 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3692 raidPtr->raidid = raidID;
3693 raidPtr->openings = RAIDOUTSTANDING;
3694
3695 /* 3. Build the configuration structure */
3696 rf_create_configuration(cset->ac, config, raidPtr);
3697
3698 /* 4. Do the configuration */
3699 retcode = rf_Configure(raidPtr, config, cset->ac);
3700
3701 if (retcode == 0) {
3702
3703 raidinit(raidPtrs[raidID]);
3704
3705 rf_markalldirty(raidPtrs[raidID]);
3706 raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
3707 if (cset->ac->clabel->root_partition==1) {
3708 /* everything configured just fine. Make a note
3709 that this set is eligible to be root. */
3710 cset->rootable = 1;
3711 /* XXX do this here? */
3712 raidPtrs[raidID]->root_partition = 1;
3713 }
3714 }
3715
3716 /* 5. Cleanup */
3717 free(config, M_RAIDFRAME);
3718
3719 *unit = raidID;
3720 return(retcode);
3721 }
3722
3723 void
3724 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3725 {
3726 struct buf *bp;
3727
3728 bp = (struct buf *)desc->bp;
3729 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3730 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3731 }
3732
/*
 * Initialize pool `p' for items of the given size, with wait channel
 * name `w_chan'.  The pool is primed with `xmin' items and will keep at
 * least that many cached (low watermark), never caching more than
 * `xmax' idle items (high watermark).
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	/* cap the number of idle items kept around */
	pool_sethiwat(p, xmax);
	/* pre-allocate xmin items up front... */
	pool_prime(p, xmin);
	/* ...and keep at least that many available */
	pool_setlowat(p, xmin);
}
3742
3743 /*
3744 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3745 * if there is IO pending and if that IO could possibly be done for a
3746 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3747 * otherwise.
3748 *
3749 */
3750
3751 int
3752 rf_buf_queue_check(int raidid)
3753 {
3754 if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) &&
3755 raidPtrs[raidid]->openings > 0) {
3756 /* there is work to do */
3757 return 0;
3758 }
3759 /* default is nothing to do */
3760 return 1;
3761 }
3762
3763 int
3764 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3765 {
3766 uint64_t numsecs;
3767 unsigned secsize;
3768 int error;
3769
3770 error = getdisksize(vp, &numsecs, &secsize);
3771 if (error == 0) {
3772 diskPtr->blockSize = secsize;
3773 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3774 diskPtr->partitionSize = numsecs;
3775 return 0;
3776 }
3777 return error;
3778 }
3779
/*
 * Autoconf match function for the raid pseudo-device: always matches.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3785
/*
 * Autoconf attach function for the raid pseudo-device.  Intentionally
 * empty: the per-unit state is set up when a RAID set is configured,
 * not at device attach time.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3791
3792
3793 static int
3794 raid_detach(device_t self, int flags)
3795 {
3796 int error;
3797 struct raid_softc *rs = &raid_softc[device_unit(self)];
3798
3799 if ((error = raidlock(rs)) != 0)
3800 return (error);
3801
3802 error = raid_detach_unlocked(rs);
3803
3804 raidunlock(rs);
3805
3806 return error;
3807 }
3808
/*
 * Publish a disk-info property dictionary (with a synthesized geometry)
 * for the raid device, replacing and releasing any previous one.
 * NOTE(review): the track/cylinder figures are fabricated from the
 * stripe layout -- presumably just to give userland plausible CHS
 * values, since a RAID set has no physical geometry.
 */
static void
rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	prop_dictionary_t disk_info, odisk_info, geom;
	disk_info = prop_dictionary_create();
	geom = prop_dictionary_create();
	prop_dictionary_set_uint64(geom, "sectors-per-unit",
				   raidPtr->totalSectors);
	prop_dictionary_set_uint32(geom, "sector-size",
				   raidPtr->bytesPerSector);

	/* one "track" per stripe of data */
	prop_dictionary_set_uint16(geom, "sectors-per-track",
				   raidPtr->Layout.dataSectorsPerStripe);
	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
				   4 * raidPtr->numCol);

	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
	    raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
	    (4 * raidPtr->numCol)));

	/* disk_info takes its own reference to geom; drop ours */
	prop_dictionary_set(disk_info, "geometry", geom);
	prop_object_release(geom);
	prop_dictionary_set(device_properties(rs->sc_dev),
			    "disk-info", disk_info);
	/* swap in the new dictionary and release the old one, if any */
	odisk_info = rs->sc_dkdev.dk_info;
	rs->sc_dkdev.dk_info = disk_info;
	if (odisk_info)
		prop_object_release(odisk_info);
}
3838
3839 /*
3840 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3841 * We end up returning whatever error was returned by the first cache flush
3842 * that fails.
3843 */
3844
3845 int
3846 rf_sync_component_caches(RF_Raid_t *raidPtr)
3847 {
3848 int c, sparecol;
3849 int e,error;
3850 int force = 1;
3851
3852 error = 0;
3853 for (c = 0; c < raidPtr->numCol; c++) {
3854 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3855 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3856 &force, FWRITE, NOCRED);
3857 if (e) {
3858 if (e != ENODEV)
3859 printf("raid%d: cache flush to component %s failed.\n",
3860 raidPtr->raidid, raidPtr->Disks[c].devname);
3861 if (error == 0) {
3862 error = e;
3863 }
3864 }
3865 }
3866 }
3867
3868 for( c = 0; c < raidPtr->numSpare ; c++) {
3869 sparecol = raidPtr->numCol + c;
3870 /* Need to ensure that the reconstruct actually completed! */
3871 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3872 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3873 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3874 if (e) {
3875 if (e != ENODEV)
3876 printf("raid%d: cache flush to component %s failed.\n",
3877 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3878 if (error == 0) {
3879 error = e;
3880 }
3881 }
3882 }
3883 }
3884 return error;
3885 }
3886