rf_netbsdkintf.c revision 1.281 1 /* $NetBSD: rf_netbsdkintf.c,v 1.281 2011/02/08 20:20:27 rmind Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.281 2011/02/08 20:20:27 rmind Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #include "raid.h"
110 #endif
111
112 #include <sys/param.h>
113 #include <sys/errno.h>
114 #include <sys/pool.h>
115 #include <sys/proc.h>
116 #include <sys/queue.h>
117 #include <sys/disk.h>
118 #include <sys/device.h>
119 #include <sys/stat.h>
120 #include <sys/ioctl.h>
121 #include <sys/fcntl.h>
122 #include <sys/systm.h>
123 #include <sys/vnode.h>
124 #include <sys/disklabel.h>
125 #include <sys/conf.h>
126 #include <sys/buf.h>
127 #include <sys/bufq.h>
128 #include <sys/reboot.h>
129 #include <sys/kauth.h>
130
131 #include <prop/proplib.h>
132
133 #include <dev/raidframe/raidframevar.h>
134 #include <dev/raidframe/raidframeio.h>
135 #include <dev/raidframe/rf_paritymap.h>
136
137 #include "rf_raid.h"
138 #include "rf_copyback.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150
151 #ifdef COMPAT_50
152 #include "rf_compat50.h"
153 #endif
154
155 #ifdef DEBUG
156 int rf_kdebug_level = 0;
157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
158 #else /* DEBUG */
159 #define db1_printf(a) { }
160 #endif /* DEBUG */
161
162 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
163
164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
165 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
166
167 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
168 * spare table */
169 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
170 * installation process */
171 #endif
172
173 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
174
175 /* prototypes */
176 static void KernelWakeupFunc(struct buf *);
177 static void InitBP(struct buf *, struct vnode *, unsigned,
178 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
179 void *, int, struct proc *);
180 static void raidinit(RF_Raid_t *);
181
182 void raidattach(int);
183 static int raid_match(device_t, cfdata_t, void *);
184 static void raid_attach(device_t, device_t, void *);
185 static int raid_detach(device_t, int);
186
187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
188 daddr_t, daddr_t);
189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
190 daddr_t, daddr_t, int);
191
192 static int raidwrite_component_label(unsigned,
193 dev_t, struct vnode *, RF_ComponentLabel_t *);
194 static int raidread_component_label(unsigned,
195 dev_t, struct vnode *, RF_ComponentLabel_t *);
196
197
198 dev_type_open(raidopen);
199 dev_type_close(raidclose);
200 dev_type_read(raidread);
201 dev_type_write(raidwrite);
202 dev_type_ioctl(raidioctl);
203 dev_type_strategy(raidstrategy);
204 dev_type_dump(raiddump);
205 dev_type_size(raidsize);
206
207 const struct bdevsw raid_bdevsw = {
208 raidopen, raidclose, raidstrategy, raidioctl,
209 raiddump, raidsize, D_DISK
210 };
211
212 const struct cdevsw raid_cdevsw = {
213 raidopen, raidclose, raidread, raidwrite, raidioctl,
214 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
215 };
216
217 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
218
219 /* XXX Not sure if the following should be replacing the raidPtrs above,
220 or if it should be used in conjunction with that...
221 */
222
223 struct raid_softc {
224 device_t sc_dev;
225 int sc_flags; /* flags */
226 int sc_cflags; /* configuration flags */
227 uint64_t sc_size; /* size of the raid device */
228 char sc_xname[20]; /* XXX external name */
229 struct disk sc_dkdev; /* generic disk device info */
230 struct bufq_state *buf_queue; /* used for the device queue */
231 };
232 /* sc_flags */
233 #define RAIDF_INITED 0x01 /* unit has been initialized */
234 #define RAIDF_WLABEL 0x02 /* label area is writable */
235 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
236 #define RAIDF_SHUTDOWN 0x08 /* unit is being shutdown */
237 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
238 #define RAIDF_LOCKED 0x80 /* unit is locked */
239
240 #define raidunit(x) DISKUNIT(x)
241 int numraid = 0;
242
243 extern struct cfdriver raid_cd;
244 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
245 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
246 DVF_DETACH_SHUTDOWN);
247
248 /*
249 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
250 * Be aware that large numbers can allow the driver to consume a lot of
251 * kernel memory, especially on writes, and in degraded mode reads.
252 *
253 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
254 * a single 64K write will typically require 64K for the old data,
255 * 64K for the old parity, and 64K for the new parity, for a total
256 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
258 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
259 *
260 * Now in degraded mode, for example, a 64K read on the above setup may
261 * require data reconstruction, which will require *all* of the 4 remaining
262 * disks to participate -- 4 * 32K/disk == 128K again.
263 */
264
265 #ifndef RAIDOUTSTANDING
266 #define RAIDOUTSTANDING 6
267 #endif
268
269 #define RAIDLABELDEV(dev) \
270 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
271
272 /* declared here, and made public, for the benefit of KVM stuff.. */
273 struct raid_softc *raid_softc;
274
275 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
276 struct disklabel *);
277 static void raidgetdisklabel(dev_t);
278 static void raidmakedisklabel(struct raid_softc *);
279
280 static int raidlock(struct raid_softc *);
281 static void raidunlock(struct raid_softc *);
282
283 static int raid_detach_unlocked(struct raid_softc *);
284
285 static void rf_markalldirty(RF_Raid_t *);
286 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
287
288 void rf_ReconThread(struct rf_recon_req *);
289 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
290 void rf_CopybackThread(RF_Raid_t *raidPtr);
291 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
292 int rf_autoconfig(device_t);
293 void rf_buildroothack(RF_ConfigSet_t *);
294
295 RF_AutoConfig_t *rf_find_raid_components(void);
296 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
297 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
298 static int rf_reasonable_label(RF_ComponentLabel_t *);
299 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
300 int rf_set_autoconfig(RF_Raid_t *, int);
301 int rf_set_rootpartition(RF_Raid_t *, int);
302 void rf_release_all_vps(RF_ConfigSet_t *);
303 void rf_cleanup_config_set(RF_ConfigSet_t *);
304 int rf_have_enough_components(RF_ConfigSet_t *);
305 int rf_auto_config_set(RF_ConfigSet_t *, int *);
306 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
307
308 static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not
309 allow autoconfig to take place.
310 Note that this is overridden by having
311 RAID_AUTOCONFIG as an option in the
312 kernel config file. */
313
314 struct RF_Pools_s rf_pools;
315
/*
 * raidattach: pseudo-device attach routine, called once at boot with the
 * number of RAID units to provide.  Allocates the global raidPtrs[] array
 * and the raid_softc[] array, boots the RAIDframe core, hooks the cfattach
 * into autoconf, and registers a finalizer that will auto-configure RAID
 * sets after all real hardware has been found.
 *
 * On allocation failure part-way through, numraid is trimmed to the number
 * of units actually set up so later bounds checks stay safe.
 */
void
raidattach(int num)
{
	int raidID;
	int i, rc;

	aprint_debug("raidattach: Asked for %d units\n", num);

	if (num <= 0) {
#ifdef DIAGNOSTIC
		panic("raidattach: count <= 0");
#endif
		return;
	}
	/* This is where all the initialization stuff gets done. */

	numraid = num;

	/* Make some space for requested number of units... */

	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
	if (raidPtrs == NULL) {
		panic("raidPtrs is NULL!!");
	}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_mutex_init(&rf_sparet_wait_mutex);

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	for (i = 0; i < num; i++)
		raidPtrs[i] = NULL;
	rc = rf_BootRaidframe();
	if (rc == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	/* put together some datastructures like the CCD device does.. This
	 * lets us lock the device and what-not when it gets opened. */

	raid_softc = (struct raid_softc *)
		malloc(num * sizeof(struct raid_softc),
		       M_RAIDFRAME, M_NOWAIT);
	if (raid_softc == NULL) {
		aprint_error("WARNING: no memory for RAIDframe driver\n");
		return;
	}

	memset(raid_softc, 0, num * sizeof(struct raid_softc));

	for (raidID = 0; raidID < num; raidID++) {
		/* FCFS buffer queue; raidstrategy enqueues here and the
		   per-unit I/O thread drains it. */
		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);

		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
			  (RF_Raid_t *));
		if (raidPtrs[raidID] == NULL) {
			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
			/* Trim numraid so only fully-initialized units are
			   visible to the rest of the driver. */
			numraid = raidID;
			return;
		}
	}

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

#ifdef RAID_AUTOCONFIG
	raidautoconfig = 1;
#endif

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
395
/*
 * rf_autoconfig: config_finalize(9) callback.  Scans the system once for
 * RAID components, groups them into configuration sets, and hands the
 * sets to rf_buildroothack() for configuration and root selection.
 *
 * Returns 0 when autoconfiguration is disabled (nothing done), 1 after a
 * scan has been performed.  Guarded so it runs at most once.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (raidautoconfig == 0)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfig = 0;

	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
423
/*
 * rf_buildroothack: walk the list of auto-detected configuration sets,
 * configure each set that has enough components and is marked for
 * autoconfiguration, and release resources for the rest.  Afterwards,
 * if the user did not hardwire a root device (rootspec), try to point
 * booted_device at a configured RAID set that is marked rootable; when
 * more than one candidate exists, fall back to matching the set's
 * component names against the MD-determined boot device, and if that is
 * still ambiguous, ask the user (RB_ASKNAME).
 *
 * Consumes config_sets: every set is cleaned up via
 * rf_cleanup_config_set() regardless of whether configuration succeeded.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int retcode;
	int raidID;
	int rootID;
	int col;
	int num_root;
	char *devname;

	rootID = 0;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		/* save the link now; cset is freed below */
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure==1) {
			retcode = rf_auto_config_set(cset,&raidID);
			if (!retcode) {
				aprint_debug("raid%d: configured ok\n", raidID);
				if (cset->rootable) {
					rootID = raidID;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		/* exactly one rootable set: it becomes the root device */
		booted_device = raid_softc[rootID].sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/* Re-count, keeping only sets that contain the device we
		   actually booted from. */
		num_root = 0;
		for (raidID = 0; raidID < numraid; raidID++) {
			if (raidPtrs[raidID]->valid == 0)
				continue;

			if (raidPtrs[raidID]->root_partition == 0)
				continue;

			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
				devname = raidPtrs[raidID]->Disks[col].devname;
				/* strip the "/dev/" prefix before comparing
				   against the autoconf device name */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
					    strlen(device_xname(booted_device))) != 0)
					continue;
				aprint_debug("raid%d includes boot device %s\n",
				    raidID, devname);
				num_root++;
				rootID = raidID;
			}
		}

		if (num_root == 1) {
			booted_device = raid_softc[rootID].sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
518
519
520 int
521 raidsize(dev_t dev)
522 {
523 struct raid_softc *rs;
524 struct disklabel *lp;
525 int part, unit, omask, size;
526
527 unit = raidunit(dev);
528 if (unit >= numraid)
529 return (-1);
530 rs = &raid_softc[unit];
531
532 if ((rs->sc_flags & RAIDF_INITED) == 0)
533 return (-1);
534
535 part = DISKPART(dev);
536 omask = rs->sc_dkdev.dk_openmask & (1 << part);
537 lp = rs->sc_dkdev.dk_label;
538
539 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
540 return (-1);
541
542 if (lp->d_partitions[part].p_fstype != FS_SWAP)
543 size = -1;
544 else
545 size = lp->d_partitions[part].p_size *
546 (lp->d_secsize / DEV_BSIZE);
547
548 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
549 return (-1);
550
551 return (size);
552
553 }
554
/*
 * raiddump: crash-dump entry point.  Only RAID 1 sets are supported
 * (one data column, one parity column).  Selects a live component —
 * preferring the master, then a spare standing in for the master, then
 * the slave, then a spare for the slave — and forwards the dump to that
 * component's block device, adjusting blkno to be relative to the
 * underlying partition.
 *
 * Returns 0 on success or an errno (ENXIO/EINVAL, or whatever the
 * component's d_dump returns).
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int part, c, sparecol, j, scol, dumpto;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);

	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* dump size must be a whole number of DEV_BSIZE blocks */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* refuse a dump that would run past the end of the RAID set */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			/* find which column this spare is standing in for */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	 */

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
692 /* ARGSUSED */
/*
 * raidopen: block/character device open entry point.
 *
 * Takes the per-unit lock, refuses units that are shutting down, reloads
 * the disklabel on the first open of an initialized unit, validates that
 * the requested partition exists (RAW_PART is always allowed), records the
 * open in the appropriate openmask (which prevents unconfiguration while
 * open), and on the very first open of a configured set marks all
 * components dirty.  Returns 0 or an errno.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, pmask;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	/* unit is being torn down; don't allow new opens */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* first open of an initialized unit: (re)read the disklabel */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
		     (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(raidPtrs[unit]);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
778 /* ARGSUSED */
779 int
780 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
781 {
782 int unit = raidunit(dev);
783 struct raid_softc *rs;
784 int error = 0;
785 int part;
786
787 if (unit >= numraid)
788 return (ENXIO);
789 rs = &raid_softc[unit];
790
791 if ((error = raidlock(rs)) != 0)
792 return (error);
793
794 part = DISKPART(dev);
795
796 /* ...that much closer to allowing unconfiguration... */
797 switch (fmt) {
798 case S_IFCHR:
799 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
800 break;
801
802 case S_IFBLK:
803 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
804 break;
805 }
806 rs->sc_dkdev.dk_openmask =
807 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
808
809 if ((rs->sc_dkdev.dk_openmask == 0) &&
810 ((rs->sc_flags & RAIDF_INITED) != 0)) {
811 /* Last one... device is not unconfigured yet.
812 Device shutdown has taken care of setting the
813 clean bits if RAIDF_INITED is not set
814 mark things as clean... */
815
816 rf_update_component_labels(raidPtrs[unit],
817 RF_FINAL_COMPONENT_UPDATE);
818
819 /* If the kernel is shutting down, it will detach
820 * this RAID set soon enough.
821 */
822 }
823
824 raidunlock(rs);
825 return (0);
826
827 }
828
829 void
830 raidstrategy(struct buf *bp)
831 {
832 int s;
833
834 unsigned int raidID = raidunit(bp->b_dev);
835 RF_Raid_t *raidPtr;
836 struct raid_softc *rs = &raid_softc[raidID];
837 int wlabel;
838
839 if ((rs->sc_flags & RAIDF_INITED) ==0) {
840 bp->b_error = ENXIO;
841 goto done;
842 }
843 if (raidID >= numraid || !raidPtrs[raidID]) {
844 bp->b_error = ENODEV;
845 goto done;
846 }
847 raidPtr = raidPtrs[raidID];
848 if (!raidPtr->valid) {
849 bp->b_error = ENODEV;
850 goto done;
851 }
852 if (bp->b_bcount == 0) {
853 db1_printf(("b_bcount is zero..\n"));
854 goto done;
855 }
856
857 /*
858 * Do bounds checking and adjust transfer. If there's an
859 * error, the bounds check will flag that for us.
860 */
861
862 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
863 if (DISKPART(bp->b_dev) == RAW_PART) {
864 uint64_t size; /* device size in DEV_BSIZE unit */
865
866 if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
867 size = raidPtr->totalSectors <<
868 (raidPtr->logBytesPerSector - DEV_BSHIFT);
869 } else {
870 size = raidPtr->totalSectors >>
871 (DEV_BSHIFT - raidPtr->logBytesPerSector);
872 }
873 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
874 goto done;
875 }
876 } else {
877 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
878 db1_printf(("Bounds check failed!!:%d %d\n",
879 (int) bp->b_blkno, (int) wlabel));
880 goto done;
881 }
882 }
883 s = splbio();
884
885 bp->b_resid = 0;
886
887 /* stuff it onto our queue */
888 bufq_put(rs->buf_queue, bp);
889
890 /* scheduled the IO to happen at the next convenient time */
891 wakeup(&(raidPtrs[raidID]->iodone));
892
893 splx(s);
894 return;
895
896 done:
897 bp->b_resid = bp->b_bcount;
898 biodone(bp);
899 }
900 /* ARGSUSED */
901 int
902 raidread(dev_t dev, struct uio *uio, int flags)
903 {
904 int unit = raidunit(dev);
905 struct raid_softc *rs;
906
907 if (unit >= numraid)
908 return (ENXIO);
909 rs = &raid_softc[unit];
910
911 if ((rs->sc_flags & RAIDF_INITED) == 0)
912 return (ENXIO);
913
914 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
915
916 }
917 /* ARGSUSED */
918 int
919 raidwrite(dev_t dev, struct uio *uio, int flags)
920 {
921 int unit = raidunit(dev);
922 struct raid_softc *rs;
923
924 if (unit >= numraid)
925 return (ENXIO);
926 rs = &raid_softc[unit];
927
928 if ((rs->sc_flags & RAIDF_INITED) == 0)
929 return (ENXIO);
930
931 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
932
933 }
934
935 static int
936 raid_detach_unlocked(struct raid_softc *rs)
937 {
938 int error;
939 RF_Raid_t *raidPtr;
940
941 raidPtr = raidPtrs[device_unit(rs->sc_dev)];
942
943 /*
944 * If somebody has a partition mounted, we shouldn't
945 * shutdown.
946 */
947 if (rs->sc_dkdev.dk_openmask != 0)
948 return EBUSY;
949
950 if ((rs->sc_flags & RAIDF_INITED) == 0)
951 ; /* not initialized: nothing to do */
952 else if ((error = rf_Shutdown(raidPtr)) != 0)
953 return error;
954 else
955 rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
956
957 /* Detach the disk. */
958 dkwedge_delall(&rs->sc_dkdev);
959 disk_detach(&rs->sc_dkdev);
960 disk_destroy(&rs->sc_dkdev);
961
962 return 0;
963 }
964
965 int
966 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
967 {
968 int unit = raidunit(dev);
969 int error = 0;
970 int part, pmask;
971 cfdata_t cf;
972 struct raid_softc *rs;
973 RF_Config_t *k_cfg, *u_cfg;
974 RF_Raid_t *raidPtr;
975 RF_RaidDisk_t *diskPtr;
976 RF_AccTotals_t *totals;
977 RF_DeviceConfig_t *d_cfg, **ucfgp;
978 u_char *specific_buf;
979 int retcode = 0;
980 int column;
981 /* int raidid; */
982 struct rf_recon_req *rrcopy, *rr;
983 RF_ComponentLabel_t *clabel;
984 RF_ComponentLabel_t *ci_label;
985 RF_ComponentLabel_t **clabel_ptr;
986 RF_SingleComponent_t *sparePtr,*componentPtr;
987 RF_SingleComponent_t component;
988 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
989 int i, j, d;
990 #ifdef __HAVE_OLD_DISKLABEL
991 struct disklabel newlabel;
992 #endif
993 struct dkwedge_info *dkw;
994
995 if (unit >= numraid)
996 return (ENXIO);
997 rs = &raid_softc[unit];
998 raidPtr = raidPtrs[unit];
999
1000 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1001 (int) DISKPART(dev), (int) unit, cmd));
1002
1003 /* Must be open for writes for these commands... */
1004 switch (cmd) {
1005 #ifdef DIOCGSECTORSIZE
1006 case DIOCGSECTORSIZE:
1007 *(u_int *)data = raidPtr->bytesPerSector;
1008 return 0;
1009 case DIOCGMEDIASIZE:
1010 *(off_t *)data =
1011 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1012 return 0;
1013 #endif
1014 case DIOCSDINFO:
1015 case DIOCWDINFO:
1016 #ifdef __HAVE_OLD_DISKLABEL
1017 case ODIOCWDINFO:
1018 case ODIOCSDINFO:
1019 #endif
1020 case DIOCWLABEL:
1021 case DIOCAWEDGE:
1022 case DIOCDWEDGE:
1023 if ((flag & FWRITE) == 0)
1024 return (EBADF);
1025 }
1026
1027 /* Must be initialized for these... */
1028 switch (cmd) {
1029 case DIOCGDINFO:
1030 case DIOCSDINFO:
1031 case DIOCWDINFO:
1032 #ifdef __HAVE_OLD_DISKLABEL
1033 case ODIOCGDINFO:
1034 case ODIOCWDINFO:
1035 case ODIOCSDINFO:
1036 case ODIOCGDEFLABEL:
1037 #endif
1038 case DIOCGPART:
1039 case DIOCWLABEL:
1040 case DIOCGDEFLABEL:
1041 case DIOCAWEDGE:
1042 case DIOCDWEDGE:
1043 case DIOCLWEDGES:
1044 case DIOCCACHESYNC:
1045 case RAIDFRAME_SHUTDOWN:
1046 case RAIDFRAME_REWRITEPARITY:
1047 case RAIDFRAME_GET_INFO:
1048 case RAIDFRAME_RESET_ACCTOTALS:
1049 case RAIDFRAME_GET_ACCTOTALS:
1050 case RAIDFRAME_KEEP_ACCTOTALS:
1051 case RAIDFRAME_GET_SIZE:
1052 case RAIDFRAME_FAIL_DISK:
1053 case RAIDFRAME_COPYBACK:
1054 case RAIDFRAME_CHECK_RECON_STATUS:
1055 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1056 case RAIDFRAME_GET_COMPONENT_LABEL:
1057 case RAIDFRAME_SET_COMPONENT_LABEL:
1058 case RAIDFRAME_ADD_HOT_SPARE:
1059 case RAIDFRAME_REMOVE_HOT_SPARE:
1060 case RAIDFRAME_INIT_LABELS:
1061 case RAIDFRAME_REBUILD_IN_PLACE:
1062 case RAIDFRAME_CHECK_PARITY:
1063 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1064 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1065 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1066 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1067 case RAIDFRAME_SET_AUTOCONFIG:
1068 case RAIDFRAME_SET_ROOT:
1069 case RAIDFRAME_DELETE_COMPONENT:
1070 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1071 case RAIDFRAME_PARITYMAP_STATUS:
1072 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1073 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1074 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1075 if ((rs->sc_flags & RAIDF_INITED) == 0)
1076 return (ENXIO);
1077 }
1078
1079 switch (cmd) {
1080 #ifdef COMPAT_50
1081 case RAIDFRAME_GET_INFO50:
1082 return rf_get_info50(raidPtr, data);
1083
1084 case RAIDFRAME_CONFIGURE50:
1085 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1086 return retcode;
1087 goto config;
1088 #endif
1089 /* configure the system */
1090 case RAIDFRAME_CONFIGURE:
1091
1092 if (raidPtr->valid) {
1093 /* There is a valid RAID set running on this unit! */
1094 printf("raid%d: Device already configured!\n",unit);
1095 return(EINVAL);
1096 }
1097
1098 /* copy-in the configuration information */
1099 /* data points to a pointer to the configuration structure */
1100
1101 u_cfg = *((RF_Config_t **) data);
1102 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1103 if (k_cfg == NULL) {
1104 return (ENOMEM);
1105 }
1106 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1107 if (retcode) {
1108 RF_Free(k_cfg, sizeof(RF_Config_t));
1109 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1110 retcode));
1111 return (retcode);
1112 }
1113 goto config;
1114 config:
1115 /* allocate a buffer for the layout-specific data, and copy it
1116 * in */
1117 if (k_cfg->layoutSpecificSize) {
1118 if (k_cfg->layoutSpecificSize > 10000) {
1119 /* sanity check */
1120 RF_Free(k_cfg, sizeof(RF_Config_t));
1121 return (EINVAL);
1122 }
1123 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1124 (u_char *));
1125 if (specific_buf == NULL) {
1126 RF_Free(k_cfg, sizeof(RF_Config_t));
1127 return (ENOMEM);
1128 }
1129 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1130 k_cfg->layoutSpecificSize);
1131 if (retcode) {
1132 RF_Free(k_cfg, sizeof(RF_Config_t));
1133 RF_Free(specific_buf,
1134 k_cfg->layoutSpecificSize);
1135 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1136 retcode));
1137 return (retcode);
1138 }
1139 } else
1140 specific_buf = NULL;
1141 k_cfg->layoutSpecific = specific_buf;
1142
1143 /* should do some kind of sanity check on the configuration.
1144 * Store the sum of all the bytes in the last byte? */
1145
1146 /* configure the system */
1147
1148 /*
1149 * Clear the entire RAID descriptor, just to make sure
1150 * there is no stale data left in the case of a
1151 * reconfiguration
1152 */
1153 memset(raidPtr, 0, sizeof(*raidPtr));
1154 raidPtr->raidid = unit;
1155
1156 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1157
1158 if (retcode == 0) {
1159
1160 /* allow this many simultaneous IO's to
1161 this RAID device */
1162 raidPtr->openings = RAIDOUTSTANDING;
1163
1164 raidinit(raidPtr);
1165 rf_markalldirty(raidPtr);
1166 }
1167 /* free the buffers. No return code here. */
1168 if (k_cfg->layoutSpecificSize) {
1169 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1170 }
1171 RF_Free(k_cfg, sizeof(RF_Config_t));
1172
1173 return (retcode);
1174
1175 /* shutdown the system */
1176 case RAIDFRAME_SHUTDOWN:
1177
1178 part = DISKPART(dev);
1179 pmask = (1 << part);
1180
1181 if ((error = raidlock(rs)) != 0)
1182 return (error);
1183
1184 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1185 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1186 (rs->sc_dkdev.dk_copenmask & pmask)))
1187 retcode = EBUSY;
1188 else {
1189 rs->sc_flags |= RAIDF_SHUTDOWN;
1190 rs->sc_dkdev.dk_copenmask &= ~pmask;
1191 rs->sc_dkdev.dk_bopenmask &= ~pmask;
1192 rs->sc_dkdev.dk_openmask &= ~pmask;
1193 retcode = 0;
1194 }
1195
1196 raidunlock(rs);
1197
1198 if (retcode != 0)
1199 return retcode;
1200
1201 /* free the pseudo device attach bits */
1202
1203 cf = device_cfdata(rs->sc_dev);
1204 if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
1205 free(cf, M_RAIDFRAME);
1206
1207 return (retcode);
1208 case RAIDFRAME_GET_COMPONENT_LABEL:
1209 clabel_ptr = (RF_ComponentLabel_t **) data;
1210 /* need to read the component label for the disk indicated
1211 by row,column in clabel */
1212
1213 /*
1214 * Perhaps there should be an option to skip the in-core
1215 * copy and hit the disk, as with disklabel(8).
1216 */
1217 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1218
1219 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1220
1221 if (retcode) {
1222 RF_Free(clabel, sizeof(*clabel));
1223 return retcode;
1224 }
1225
1226 clabel->row = 0; /* Don't allow looking at anything else.*/
1227
1228 column = clabel->column;
1229
1230 if ((column < 0) || (column >= raidPtr->numCol +
1231 raidPtr->numSpare)) {
1232 RF_Free(clabel, sizeof(*clabel));
1233 return EINVAL;
1234 }
1235
1236 RF_Free(clabel, sizeof(*clabel));
1237
1238 clabel = raidget_component_label(raidPtr, column);
1239
1240 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1241
1242 #if 0
1243 case RAIDFRAME_SET_COMPONENT_LABEL:
1244 clabel = (RF_ComponentLabel_t *) data;
1245
1246 /* XXX check the label for valid stuff... */
1247 /* Note that some things *should not* get modified --
1248 the user should be re-initing the labels instead of
1249 trying to patch things.
1250 */
1251
1252 raidid = raidPtr->raidid;
1253 #ifdef DEBUG
1254 printf("raid%d: Got component label:\n", raidid);
1255 printf("raid%d: Version: %d\n", raidid, clabel->version);
1256 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1257 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1258 printf("raid%d: Column: %d\n", raidid, clabel->column);
1259 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1260 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1261 printf("raid%d: Status: %d\n", raidid, clabel->status);
1262 #endif
1263 clabel->row = 0;
1264 column = clabel->column;
1265
1266 if ((column < 0) || (column >= raidPtr->numCol)) {
1267 return(EINVAL);
1268 }
1269
1270 /* XXX this isn't allowed to do anything for now :-) */
1271
1272 /* XXX and before it is, we need to fill in the rest
1273 of the fields!?!?!?! */
1274 memcpy(raidget_component_label(raidPtr, column),
1275 clabel, sizeof(*clabel));
1276 raidflush_component_label(raidPtr, column);
1277 return (0);
1278 #endif
1279
1280 case RAIDFRAME_INIT_LABELS:
1281 clabel = (RF_ComponentLabel_t *) data;
1282 /*
1283 we only want the serial number from
1284 the above. We get all the rest of the information
1285 from the config that was used to create this RAID
1286 set.
1287 */
1288
1289 raidPtr->serial_number = clabel->serial_number;
1290
1291 for(column=0;column<raidPtr->numCol;column++) {
1292 diskPtr = &raidPtr->Disks[column];
1293 if (!RF_DEAD_DISK(diskPtr->status)) {
1294 ci_label = raidget_component_label(raidPtr,
1295 column);
1296 /* Zeroing this is important. */
1297 memset(ci_label, 0, sizeof(*ci_label));
1298 raid_init_component_label(raidPtr, ci_label);
1299 ci_label->serial_number =
1300 raidPtr->serial_number;
1301 ci_label->row = 0; /* we dont' pretend to support more */
1302 ci_label->partitionSize =
1303 diskPtr->partitionSize;
1304 ci_label->column = column;
1305 raidflush_component_label(raidPtr, column);
1306 }
1307 /* XXXjld what about the spares? */
1308 }
1309
1310 return (retcode);
1311 case RAIDFRAME_SET_AUTOCONFIG:
1312 d = rf_set_autoconfig(raidPtr, *(int *) data);
1313 printf("raid%d: New autoconfig value is: %d\n",
1314 raidPtr->raidid, d);
1315 *(int *) data = d;
1316 return (retcode);
1317
1318 case RAIDFRAME_SET_ROOT:
1319 d = rf_set_rootpartition(raidPtr, *(int *) data);
1320 printf("raid%d: New rootpartition value is: %d\n",
1321 raidPtr->raidid, d);
1322 *(int *) data = d;
1323 return (retcode);
1324
1325 /* initialize all parity */
1326 case RAIDFRAME_REWRITEPARITY:
1327
1328 if (raidPtr->Layout.map->faultsTolerated == 0) {
1329 /* Parity for RAID 0 is trivially correct */
1330 raidPtr->parity_good = RF_RAID_CLEAN;
1331 return(0);
1332 }
1333
1334 if (raidPtr->parity_rewrite_in_progress == 1) {
1335 /* Re-write is already in progress! */
1336 return(EINVAL);
1337 }
1338
1339 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1340 rf_RewriteParityThread,
1341 raidPtr,"raid_parity");
1342 return (retcode);
1343
1344
1345 case RAIDFRAME_ADD_HOT_SPARE:
1346 sparePtr = (RF_SingleComponent_t *) data;
1347 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1348 retcode = rf_add_hot_spare(raidPtr, &component);
1349 return(retcode);
1350
1351 case RAIDFRAME_REMOVE_HOT_SPARE:
1352 return(retcode);
1353
1354 case RAIDFRAME_DELETE_COMPONENT:
1355 componentPtr = (RF_SingleComponent_t *)data;
1356 memcpy( &component, componentPtr,
1357 sizeof(RF_SingleComponent_t));
1358 retcode = rf_delete_component(raidPtr, &component);
1359 return(retcode);
1360
1361 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1362 componentPtr = (RF_SingleComponent_t *)data;
1363 memcpy( &component, componentPtr,
1364 sizeof(RF_SingleComponent_t));
1365 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1366 return(retcode);
1367
1368 case RAIDFRAME_REBUILD_IN_PLACE:
1369
1370 if (raidPtr->Layout.map->faultsTolerated == 0) {
1371 /* Can't do this on a RAID 0!! */
1372 return(EINVAL);
1373 }
1374
1375 if (raidPtr->recon_in_progress == 1) {
1376 /* a reconstruct is already in progress! */
1377 return(EINVAL);
1378 }
1379
1380 componentPtr = (RF_SingleComponent_t *) data;
1381 memcpy( &component, componentPtr,
1382 sizeof(RF_SingleComponent_t));
1383 component.row = 0; /* we don't support any more */
1384 column = component.column;
1385
1386 if ((column < 0) || (column >= raidPtr->numCol)) {
1387 return(EINVAL);
1388 }
1389
1390 RF_LOCK_MUTEX(raidPtr->mutex);
1391 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1392 (raidPtr->numFailures > 0)) {
1393 /* XXX 0 above shouldn't be constant!!! */
1394 /* some component other than this has failed.
1395 Let's not make things worse than they already
1396 are... */
1397 printf("raid%d: Unable to reconstruct to disk at:\n",
1398 raidPtr->raidid);
1399 printf("raid%d: Col: %d Too many failures.\n",
1400 raidPtr->raidid, column);
1401 RF_UNLOCK_MUTEX(raidPtr->mutex);
1402 return (EINVAL);
1403 }
1404 if (raidPtr->Disks[column].status ==
1405 rf_ds_reconstructing) {
1406 printf("raid%d: Unable to reconstruct to disk at:\n",
1407 raidPtr->raidid);
1408 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column);
1409
1410 RF_UNLOCK_MUTEX(raidPtr->mutex);
1411 return (EINVAL);
1412 }
1413 if (raidPtr->Disks[column].status == rf_ds_spared) {
1414 RF_UNLOCK_MUTEX(raidPtr->mutex);
1415 return (EINVAL);
1416 }
1417 RF_UNLOCK_MUTEX(raidPtr->mutex);
1418
1419 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1420 if (rrcopy == NULL)
1421 return(ENOMEM);
1422
1423 rrcopy->raidPtr = (void *) raidPtr;
1424 rrcopy->col = column;
1425
1426 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1427 rf_ReconstructInPlaceThread,
1428 rrcopy,"raid_reconip");
1429 return(retcode);
1430
1431 case RAIDFRAME_GET_INFO:
1432 if (!raidPtr->valid)
1433 return (ENODEV);
1434 ucfgp = (RF_DeviceConfig_t **) data;
1435 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1436 (RF_DeviceConfig_t *));
1437 if (d_cfg == NULL)
1438 return (ENOMEM);
1439 d_cfg->rows = 1; /* there is only 1 row now */
1440 d_cfg->cols = raidPtr->numCol;
1441 d_cfg->ndevs = raidPtr->numCol;
1442 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1443 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1444 return (ENOMEM);
1445 }
1446 d_cfg->nspares = raidPtr->numSpare;
1447 if (d_cfg->nspares >= RF_MAX_DISKS) {
1448 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1449 return (ENOMEM);
1450 }
1451 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1452 d = 0;
1453 for (j = 0; j < d_cfg->cols; j++) {
1454 d_cfg->devs[d] = raidPtr->Disks[j];
1455 d++;
1456 }
1457 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1458 d_cfg->spares[i] = raidPtr->Disks[j];
1459 }
1460 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1461 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1462
1463 return (retcode);
1464
1465 case RAIDFRAME_CHECK_PARITY:
1466 *(int *) data = raidPtr->parity_good;
1467 return (0);
1468
1469 case RAIDFRAME_PARITYMAP_STATUS:
1470 if (rf_paritymap_ineligible(raidPtr))
1471 return EINVAL;
1472 rf_paritymap_status(raidPtr->parity_map,
1473 (struct rf_pmstat *)data);
1474 return 0;
1475
1476 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1477 if (rf_paritymap_ineligible(raidPtr))
1478 return EINVAL;
1479 if (raidPtr->parity_map == NULL)
1480 return ENOENT; /* ??? */
1481 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1482 (struct rf_pmparams *)data, 1))
1483 return EINVAL;
1484 return 0;
1485
1486 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1487 if (rf_paritymap_ineligible(raidPtr))
1488 return EINVAL;
1489 *(int *) data = rf_paritymap_get_disable(raidPtr);
1490 return 0;
1491
1492 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1493 if (rf_paritymap_ineligible(raidPtr))
1494 return EINVAL;
1495 rf_paritymap_set_disable(raidPtr, *(int *)data);
1496 /* XXX should errors be passed up? */
1497 return 0;
1498
1499 case RAIDFRAME_RESET_ACCTOTALS:
1500 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1501 return (0);
1502
1503 case RAIDFRAME_GET_ACCTOTALS:
1504 totals = (RF_AccTotals_t *) data;
1505 *totals = raidPtr->acc_totals;
1506 return (0);
1507
1508 case RAIDFRAME_KEEP_ACCTOTALS:
1509 raidPtr->keep_acc_totals = *(int *)data;
1510 return (0);
1511
1512 case RAIDFRAME_GET_SIZE:
1513 *(int *) data = raidPtr->totalSectors;
1514 return (0);
1515
1516 /* fail a disk & optionally start reconstruction */
1517 case RAIDFRAME_FAIL_DISK:
1518
1519 if (raidPtr->Layout.map->faultsTolerated == 0) {
1520 /* Can't do this on a RAID 0!! */
1521 return(EINVAL);
1522 }
1523
1524 rr = (struct rf_recon_req *) data;
1525 rr->row = 0;
1526 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1527 return (EINVAL);
1528
1529
1530 RF_LOCK_MUTEX(raidPtr->mutex);
1531 if (raidPtr->status == rf_rs_reconstructing) {
1532 /* you can't fail a disk while we're reconstructing! */
1533 /* XXX wrong for RAID6 */
1534 RF_UNLOCK_MUTEX(raidPtr->mutex);
1535 return (EINVAL);
1536 }
1537 if ((raidPtr->Disks[rr->col].status ==
1538 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1539 /* some other component has failed. Let's not make
1540 things worse. XXX wrong for RAID6 */
1541 RF_UNLOCK_MUTEX(raidPtr->mutex);
1542 return (EINVAL);
1543 }
1544 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1545 /* Can't fail a spared disk! */
1546 RF_UNLOCK_MUTEX(raidPtr->mutex);
1547 return (EINVAL);
1548 }
1549 RF_UNLOCK_MUTEX(raidPtr->mutex);
1550
1551 /* make a copy of the recon request so that we don't rely on
1552 * the user's buffer */
1553 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1554 if (rrcopy == NULL)
1555 return(ENOMEM);
1556 memcpy(rrcopy, rr, sizeof(*rr));
1557 rrcopy->raidPtr = (void *) raidPtr;
1558
1559 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1560 rf_ReconThread,
1561 rrcopy,"raid_recon");
1562 return (0);
1563
1564 /* invoke a copyback operation after recon on whatever disk
1565 * needs it, if any */
1566 case RAIDFRAME_COPYBACK:
1567
1568 if (raidPtr->Layout.map->faultsTolerated == 0) {
1569 /* This makes no sense on a RAID 0!! */
1570 return(EINVAL);
1571 }
1572
1573 if (raidPtr->copyback_in_progress == 1) {
1574 /* Copyback is already in progress! */
1575 return(EINVAL);
1576 }
1577
1578 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1579 rf_CopybackThread,
1580 raidPtr,"raid_copyback");
1581 return (retcode);
1582
1583 /* return the percentage completion of reconstruction */
1584 case RAIDFRAME_CHECK_RECON_STATUS:
1585 if (raidPtr->Layout.map->faultsTolerated == 0) {
1586 /* This makes no sense on a RAID 0, so tell the
1587 user it's done. */
1588 *(int *) data = 100;
1589 return(0);
1590 }
1591 if (raidPtr->status != rf_rs_reconstructing)
1592 *(int *) data = 100;
1593 else {
1594 if (raidPtr->reconControl->numRUsTotal > 0) {
1595 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1596 } else {
1597 *(int *) data = 0;
1598 }
1599 }
1600 return (0);
1601 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1602 progressInfoPtr = (RF_ProgressInfo_t **) data;
1603 if (raidPtr->status != rf_rs_reconstructing) {
1604 progressInfo.remaining = 0;
1605 progressInfo.completed = 100;
1606 progressInfo.total = 100;
1607 } else {
1608 progressInfo.total =
1609 raidPtr->reconControl->numRUsTotal;
1610 progressInfo.completed =
1611 raidPtr->reconControl->numRUsComplete;
1612 progressInfo.remaining = progressInfo.total -
1613 progressInfo.completed;
1614 }
1615 retcode = copyout(&progressInfo, *progressInfoPtr,
1616 sizeof(RF_ProgressInfo_t));
1617 return (retcode);
1618
1619 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1620 if (raidPtr->Layout.map->faultsTolerated == 0) {
1621 /* This makes no sense on a RAID 0, so tell the
1622 user it's done. */
1623 *(int *) data = 100;
1624 return(0);
1625 }
1626 if (raidPtr->parity_rewrite_in_progress == 1) {
1627 *(int *) data = 100 *
1628 raidPtr->parity_rewrite_stripes_done /
1629 raidPtr->Layout.numStripe;
1630 } else {
1631 *(int *) data = 100;
1632 }
1633 return (0);
1634
1635 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1636 progressInfoPtr = (RF_ProgressInfo_t **) data;
1637 if (raidPtr->parity_rewrite_in_progress == 1) {
1638 progressInfo.total = raidPtr->Layout.numStripe;
1639 progressInfo.completed =
1640 raidPtr->parity_rewrite_stripes_done;
1641 progressInfo.remaining = progressInfo.total -
1642 progressInfo.completed;
1643 } else {
1644 progressInfo.remaining = 0;
1645 progressInfo.completed = 100;
1646 progressInfo.total = 100;
1647 }
1648 retcode = copyout(&progressInfo, *progressInfoPtr,
1649 sizeof(RF_ProgressInfo_t));
1650 return (retcode);
1651
1652 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1653 if (raidPtr->Layout.map->faultsTolerated == 0) {
1654 /* This makes no sense on a RAID 0 */
1655 *(int *) data = 100;
1656 return(0);
1657 }
1658 if (raidPtr->copyback_in_progress == 1) {
1659 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1660 raidPtr->Layout.numStripe;
1661 } else {
1662 *(int *) data = 100;
1663 }
1664 return (0);
1665
1666 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1667 progressInfoPtr = (RF_ProgressInfo_t **) data;
1668 if (raidPtr->copyback_in_progress == 1) {
1669 progressInfo.total = raidPtr->Layout.numStripe;
1670 progressInfo.completed =
1671 raidPtr->copyback_stripes_done;
1672 progressInfo.remaining = progressInfo.total -
1673 progressInfo.completed;
1674 } else {
1675 progressInfo.remaining = 0;
1676 progressInfo.completed = 100;
1677 progressInfo.total = 100;
1678 }
1679 retcode = copyout(&progressInfo, *progressInfoPtr,
1680 sizeof(RF_ProgressInfo_t));
1681 return (retcode);
1682
1683 /* the sparetable daemon calls this to wait for the kernel to
1684 * need a spare table. this ioctl does not return until a
1685 * spare table is needed. XXX -- calling mpsleep here in the
1686 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1687 * -- I should either compute the spare table in the kernel,
1688 * or have a different -- XXX XXX -- interface (a different
1689 * character device) for delivering the table -- XXX */
1690 #if 0
1691 case RAIDFRAME_SPARET_WAIT:
1692 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1693 while (!rf_sparet_wait_queue)
1694 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1695 waitreq = rf_sparet_wait_queue;
1696 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1697 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1698
1699 /* structure assignment */
1700 *((RF_SparetWait_t *) data) = *waitreq;
1701
1702 RF_Free(waitreq, sizeof(*waitreq));
1703 return (0);
1704
1705 /* wakes up a process waiting on SPARET_WAIT and puts an error
1706 * code in it that will cause the dameon to exit */
1707 case RAIDFRAME_ABORT_SPARET_WAIT:
1708 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1709 waitreq->fcol = -1;
1710 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1711 waitreq->next = rf_sparet_wait_queue;
1712 rf_sparet_wait_queue = waitreq;
1713 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1714 wakeup(&rf_sparet_wait_queue);
1715 return (0);
1716
1717 /* used by the spare table daemon to deliver a spare table
1718 * into the kernel */
1719 case RAIDFRAME_SEND_SPARET:
1720
1721 /* install the spare table */
1722 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1723
1724 /* respond to the requestor. the return status of the spare
1725 * table installation is passed in the "fcol" field */
1726 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1727 waitreq->fcol = retcode;
1728 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1729 waitreq->next = rf_sparet_resp_queue;
1730 rf_sparet_resp_queue = waitreq;
1731 wakeup(&rf_sparet_resp_queue);
1732 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1733
1734 return (retcode);
1735 #endif
1736
1737 default:
1738 break; /* fall through to the os-specific code below */
1739
1740 }
1741
1742 if (!raidPtr->valid)
1743 return (EINVAL);
1744
1745 /*
1746 * Add support for "regular" device ioctls here.
1747 */
1748
1749 error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
1750 if (error != EPASSTHROUGH)
1751 return (error);
1752
1753 switch (cmd) {
1754 case DIOCGDINFO:
1755 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1756 break;
1757 #ifdef __HAVE_OLD_DISKLABEL
1758 case ODIOCGDINFO:
1759 newlabel = *(rs->sc_dkdev.dk_label);
1760 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1761 return ENOTTY;
1762 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1763 break;
1764 #endif
1765
1766 case DIOCGPART:
1767 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1768 ((struct partinfo *) data)->part =
1769 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1770 break;
1771
1772 case DIOCWDINFO:
1773 case DIOCSDINFO:
1774 #ifdef __HAVE_OLD_DISKLABEL
1775 case ODIOCWDINFO:
1776 case ODIOCSDINFO:
1777 #endif
1778 {
1779 struct disklabel *lp;
1780 #ifdef __HAVE_OLD_DISKLABEL
1781 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1782 memset(&newlabel, 0, sizeof newlabel);
1783 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1784 lp = &newlabel;
1785 } else
1786 #endif
1787 lp = (struct disklabel *)data;
1788
1789 if ((error = raidlock(rs)) != 0)
1790 return (error);
1791
1792 rs->sc_flags |= RAIDF_LABELLING;
1793
1794 error = setdisklabel(rs->sc_dkdev.dk_label,
1795 lp, 0, rs->sc_dkdev.dk_cpulabel);
1796 if (error == 0) {
1797 if (cmd == DIOCWDINFO
1798 #ifdef __HAVE_OLD_DISKLABEL
1799 || cmd == ODIOCWDINFO
1800 #endif
1801 )
1802 error = writedisklabel(RAIDLABELDEV(dev),
1803 raidstrategy, rs->sc_dkdev.dk_label,
1804 rs->sc_dkdev.dk_cpulabel);
1805 }
1806 rs->sc_flags &= ~RAIDF_LABELLING;
1807
1808 raidunlock(rs);
1809
1810 if (error)
1811 return (error);
1812 break;
1813 }
1814
1815 case DIOCWLABEL:
1816 if (*(int *) data != 0)
1817 rs->sc_flags |= RAIDF_WLABEL;
1818 else
1819 rs->sc_flags &= ~RAIDF_WLABEL;
1820 break;
1821
1822 case DIOCGDEFLABEL:
1823 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1824 break;
1825
1826 #ifdef __HAVE_OLD_DISKLABEL
1827 case ODIOCGDEFLABEL:
1828 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1829 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1830 return ENOTTY;
1831 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1832 break;
1833 #endif
1834
1835 case DIOCAWEDGE:
1836 case DIOCDWEDGE:
1837 dkw = (void *)data;
1838
1839 /* If the ioctl happens here, the parent is us. */
1840 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1841 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1842
1843 case DIOCLWEDGES:
1844 return dkwedge_list(&rs->sc_dkdev,
1845 (struct dkwedge_list *)data, l);
1846 case DIOCCACHESYNC:
1847 return rf_sync_component_caches(raidPtr);
1848 default:
1849 retcode = ENOTTY;
1850 }
1851 return (retcode);
1852
1853 }
1854
1855
1856 /* raidinit -- complete the rest of the initialization for the
1857 RAIDframe device. */
1858
1859
1860 static void
1861 raidinit(RF_Raid_t *raidPtr)
1862 {
1863 cfdata_t cf;
1864 struct raid_softc *rs;
1865 int unit;
1866
1867 unit = raidPtr->raidid;
1868
1869 rs = &raid_softc[unit];
1870
1871 /* XXX should check return code first... */
1872 rs->sc_flags |= RAIDF_INITED;
1873
1874 /* XXX doesn't check bounds. */
1875 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1876
1877 /* attach the pseudo device */
1878 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1879 cf->cf_name = raid_cd.cd_name;
1880 cf->cf_atname = raid_cd.cd_name;
1881 cf->cf_unit = unit;
1882 cf->cf_fstate = FSTATE_STAR;
1883
1884 rs->sc_dev = config_attach_pseudo(cf);
1885
1886 if (rs->sc_dev == NULL) {
1887 printf("raid%d: config_attach_pseudo failed\n",
1888 raidPtr->raidid);
1889 rs->sc_flags &= ~RAIDF_INITED;
1890 free(cf, M_RAIDFRAME);
1891 return;
1892 }
1893
1894 /* disk_attach actually creates space for the CPU disklabel, among
1895 * other things, so it's critical to call this *BEFORE* we try putzing
1896 * with disklabels. */
1897
1898 disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1899 disk_attach(&rs->sc_dkdev);
1900 disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
1901
1902 /* XXX There may be a weird interaction here between this, and
1903 * protectedSectors, as used in RAIDframe. */
1904
1905 rs->sc_size = raidPtr->totalSectors;
1906
1907 dkwedge_discover(&rs->sc_dkdev);
1908
1909 rf_set_properties(rs, raidPtr);
1910
1911 }
1912 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1913 /* wake up the daemon & tell it to get us a spare table
1914 * XXX
1915 * the entries in the queues should be tagged with the raidPtr
1916 * so that in the extremely rare case that two recons happen at once,
1917 * we know for which device were requesting a spare table
1918 * XXX
1919 *
1920 * XXX This code is not currently used. GO
1921 */
/*
 * Hand a spare-table request to the user-land sparetable daemon and wait
 * for the reply.  The request is queued on rf_sparet_wait_queue and the
 * daemon is woken; we then sleep until a response appears on
 * rf_sparet_resp_queue.  Returns the status the daemon passed back in
 * the response's fcol field.
 *
 * NOTE(review): an older comment here claimed "mpsleep unlocks the
 * mutex", but tsleep() below is not given rf_sparet_wait_mutex as an
 * interlock, so the mutex appears to be held across the sleep -- confirm
 * whether this can deadlock with the responder.  (Per the XXX above,
 * this code is not currently used.)
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	/* Queue the request and wake the waiting daemon. */
	RF_LOCK_MUTEX(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	wakeup(&rf_sparet_wait_queue);

	/* Wait for a response to show up on the response queue. */
	while (!rf_sparet_resp_queue) {
		tsleep(&rf_sparet_resp_queue, PRIBIO,
		       "raidframe getsparetable", 0);
	}
	/* Dequeue the response; ownership of it transfers to us. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);

	/* The daemon's status is delivered in fcol. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1946 #endif
1947
1948 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1949 * bp & passes it down.
1950 * any calls originating in the kernel must use non-blocking I/O
1951 * do some extra sanity checking to return "appropriate" error values for
1952 * certain conditions (to make some standard utilities work)
1953 *
1954 * Formerly known as: rf_DoAccessKernel
1955 */
/*
 * Drain the softc's buffer queue, dispatching up to raidPtr->openings
 * requests into RAIDframe via rf_DoAccess().  Translates each buf's
 * partition-relative b_blkno into an absolute RAID address, validates
 * the range and sector alignment, and fails bad requests with
 * ENOSPC/EINVAL via biodone().  All dispatched I/O is asynchronous.
 *
 * Locking: raidPtr->mutex is taken on entry, held at the top of each
 * loop iteration, and released before returning.
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	int unit;
	struct raid_softc *rs;
	int do_async;
	struct buf *bp;
	int rc;

	unit = raidPtr->raidid;
	rs = &raid_softc[unit];

	/* quick check to see if anything has died recently */
	RF_LOCK_MUTEX(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the mutex around the label update; it does I/O. */
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	while (raidPtr->openings > 0) {
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* Convert DEV_BSIZE units to RAID sector units. */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			/* Non-raw partition: add the partition offset. */
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		/* pb accounts for a trailing partial sector, if any. */
		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* NOTE(review): "1 ||" makes this condition always true,
		 * defeating the rf_debugKernelAccess flag -- presumably
		 * leftover debugging; confirm before removing. */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject requests past the end of the set; the (sum < x)
		 * comparisons also catch arithmetic wrap-around. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Reject transfers that are not a multiple of the sector
		 * size. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* Consume one opening for this request. */
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->openings--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		/* Start the disk-busy accounting for this transfer. */
		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* Dispatch failed: fail the buf here and keep
			 * draining the queue. */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		RF_LOCK_MUTEX(raidPtr->mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->mutex);
}
2076
2077
2078
2079
2080 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2081
/* invoke an I/O from kernel mode. Disk queue should be locked upon entry */

int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	/* Map the RAIDframe request type onto a buf-level operation. */
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	/* Remember which queue this request belongs to; the completion
	 * handler (KernelWakeupFunc) recovers it from req->queue. */
	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* Fake a completed buf and push it straight through the
		 * completion path -- no real I/O is issued. */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		/* Start the disk-wait timer for access tracing. */
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Fill in the buf for the component device; completion
		 * lands in KernelWakeupFunc with req as b_private. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		       op, queue->rf_cinfo->ci_dev,
		       req->sectorOffset, req->numSector,
		       req->buf, KernelWakeupFunc, (void *) req,
		       queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				    (long) bp->b_blkno));
		}
		/* Account for the request before issuing it. */
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			    (int) req->sectorOffset, (int) req->numSector,
			    (int) (req->numSector <<
				   queue->raidPtr->logBytesPerSector),
			    (int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	/* Always reports success; errors surface via the completion
	 * callback (req->error). */
	return (0);
}
2155 /* this is the callback function associated with a I/O invoked from
2156 kernel code.
2157 */
/*
 * Biodone callback for component I/O issued by rf_DispatchKernelIO().
 * Accounts trace timing, marks the component failed on I/O error (only
 * if doing so would not exceed the set's fault tolerance), records the
 * error in the request, and hands the request to the raidio thread via
 * the raidPtr->iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;
	int s;

	s = splbio();		/* block bio interrupts while we finish up */
	db1_printf(("recovering the request queue:\n"));
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

#if RF_ACC_TRACE > 0
	/* Charge the elapsed disk time to this access' trace record. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		RF_LOCK_MUTEX(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		RF_UNLOCK_MUTEX(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error. Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */

	req->error = bp->b_error;

	simple_lock(&queue->raidPtr->iodone_lock);

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	wakeup(&(queue->raidPtr->iodone));

	simple_unlock(&queue->raidPtr->iodone_lock);

	splx(s);
}
2227
2228
2229
2230 /*
2231 * initialize a buf structure for doing an I/O in the kernel.
2232 */
2233 static void
2234 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2235 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2236 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2237 struct proc *b_proc)
2238 {
2239 /* bp->b_flags = B_PHYS | rw_flag; */
2240 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2241 bp->b_oflags = 0;
2242 bp->b_cflags = 0;
2243 bp->b_bcount = numSect << logBytesPerSector;
2244 bp->b_bufsize = bp->b_bcount;
2245 bp->b_error = 0;
2246 bp->b_dev = dev;
2247 bp->b_data = bf;
2248 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2249 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2250 if (bp->b_bcount == 0) {
2251 panic("bp->b_bcount is zero in InitBP!!");
2252 }
2253 bp->b_proc = b_proc;
2254 bp->b_iodone = cbFunc;
2255 bp->b_private = cbArg;
2256 }
2257
2258 static void
2259 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2260 struct disklabel *lp)
2261 {
2262 memset(lp, 0, sizeof(*lp));
2263
2264 /* fabricate a label... */
2265 lp->d_secperunit = raidPtr->totalSectors;
2266 lp->d_secsize = raidPtr->bytesPerSector;
2267 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2268 lp->d_ntracks = 4 * raidPtr->numCol;
2269 lp->d_ncylinders = raidPtr->totalSectors /
2270 (lp->d_nsectors * lp->d_ntracks);
2271 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2272
2273 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2274 lp->d_type = DTYPE_RAID;
2275 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2276 lp->d_rpm = 3600;
2277 lp->d_interleave = 1;
2278 lp->d_flags = 0;
2279
2280 lp->d_partitions[RAW_PART].p_offset = 0;
2281 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2282 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2283 lp->d_npartitions = RAW_PART + 1;
2284
2285 lp->d_magic = DISKMAGIC;
2286 lp->d_magic2 = DISKMAGIC;
2287 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2288
2289 }
2290 /*
2291 * Read the disklabel from the raid device. If one is not present, fake one
2292 * up.
2293 */
/*
 * Read the disklabel from the raid device.  A default label is always
 * fabricated first so missing fields have sane values; if no on-disk
 * label is found, raidmakedisklabel() finishes the fake one.  If a
 * label is found, its geometry is sanity-checked against the actual
 * size of the set (warnings only; the label is still used).
 */
static void
raidgetdisklabel(dev_t dev)
{
	int unit = raidunit(dev);
	struct raid_softc *rs = &raid_softc[unit];
	const char *errstring;
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
	RF_Raid_t *raidPtr;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = raidPtrs[unit];

	/* start from a fabricated default label for this set */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		/* no valid label on disk: synthesize one */
		raidmakedisklabel(rs);
	else {
		int i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and an old disklabel may be used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				    "exceeds the size of raid (%" PRIu64 ")\n",
				    unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
2346 /*
2347 * Take care of things one might want to take care of in the event
2348 * that a disklabel isn't present.
2349 */
2350 static void
2351 raidmakedisklabel(struct raid_softc *rs)
2352 {
2353 struct disklabel *lp = rs->sc_dkdev.dk_label;
2354 db1_printf(("Making a label..\n"));
2355
2356 /*
2357 * For historical reasons, if there's no disklabel present
2358 * the raw partition must be marked FS_BSDFFS.
2359 */
2360
2361 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2362
2363 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2364
2365 lp->d_checksum = dkcksum(lp);
2366 }
2367 /*
2368 * Wait interruptibly for an exclusive lock.
2369 *
2370 * XXX
2371 * Several drivers do this; it should be abstracted and made MP-safe.
2372 * (Hmm... where have we seen this warning before :-> GO )
2373 */
2374 static int
2375 raidlock(struct raid_softc *rs)
2376 {
2377 int error;
2378
2379 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2380 rs->sc_flags |= RAIDF_WANTED;
2381 if ((error =
2382 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2383 return (error);
2384 }
2385 rs->sc_flags |= RAIDF_LOCKED;
2386 return (0);
2387 }
2388 /*
2389 * Unlock and wake up any waiters.
2390 */
2391 static void
2392 raidunlock(struct raid_softc *rs)
2393 {
2394
2395 rs->sc_flags &= ~RAIDF_LOCKED;
2396 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2397 rs->sc_flags &= ~RAIDF_WANTED;
2398 wakeup(rs);
2399 }
2400 }
2401
2402
2403 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2404 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2405 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2406
/*
 * Byte offset of the component information (label) area on each
 * component.  A function rather than a bare constant so the policy
 * lives in one place.
 */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
2413
2414 static daddr_t
2415 rf_component_info_size(unsigned secsize)
2416 {
2417 daddr_t info_size;
2418
2419 KASSERT(secsize);
2420 if (secsize > RF_COMPONENT_INFO_SIZE)
2421 info_size = secsize;
2422 else
2423 info_size = RF_COMPONENT_INFO_SIZE;
2424
2425 return info_size;
2426 }
2427
2428 static daddr_t
2429 rf_parity_map_offset(RF_Raid_t *raidPtr)
2430 {
2431 daddr_t map_offset;
2432
2433 KASSERT(raidPtr->bytesPerSector);
2434 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2435 map_offset = raidPtr->bytesPerSector;
2436 else
2437 map_offset = RF_COMPONENT_INFO_SIZE;
2438 map_offset += rf_component_info_offset();
2439
2440 return map_offset;
2441 }
2442
2443 static daddr_t
2444 rf_parity_map_size(RF_Raid_t *raidPtr)
2445 {
2446 daddr_t map_size;
2447
2448 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2449 map_size = raidPtr->bytesPerSector;
2450 else
2451 map_size = RF_PARITY_MAP_SIZE;
2452
2453 return map_size;
2454 }
2455
2456 int
2457 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2458 {
2459 RF_ComponentLabel_t *clabel;
2460
2461 clabel = raidget_component_label(raidPtr, col);
2462 clabel->clean = RF_RAID_CLEAN;
2463 raidflush_component_label(raidPtr, col);
2464 return(0);
2465 }
2466
2467
2468 int
2469 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2470 {
2471 RF_ComponentLabel_t *clabel;
2472
2473 clabel = raidget_component_label(raidPtr, col);
2474 clabel->clean = RF_RAID_DIRTY;
2475 raidflush_component_label(raidPtr, col);
2476 return(0);
2477 }
2478
/*
 * Re-read the component label for column `col' from disk into the
 * in-core copy (raid_cinfo[col].ci_label).  Returns 0 on success or
 * an errno from the underlying read.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2488
/*
 * Return a pointer to the in-core component label for column `col'.
 * Callers may modify it and push it out with
 * raidflush_component_label().
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2494
2495 int
2496 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2497 {
2498 RF_ComponentLabel_t *label;
2499
2500 label = &raidPtr->raid_cinfo[col].ci_label;
2501 label->mod_counter = raidPtr->mod_counter;
2502 #ifndef RF_NO_PARITY_MAP
2503 label->parity_map_modcount = label->mod_counter;
2504 #endif
2505 return raidwrite_component_label(raidPtr->bytesPerSector,
2506 raidPtr->Disks[col].dev,
2507 raidPtr->raid_cinfo[col].ci_vp, label);
2508 }
2509
2510
/*
 * Read the component label from the standard component-info offset on
 * the given device into *clabel.  secsize is the component's sector
 * size, used to size the label area.
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
2520
2521 /* ARGSUSED */
2522 static int
2523 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2524 size_t msize, daddr_t offset, daddr_t dsize)
2525 {
2526 struct buf *bp;
2527 const struct bdevsw *bdev;
2528 int error;
2529
2530 /* XXX should probably ensure that we don't try to do this if
2531 someone has changed rf_protected_sectors. */
2532
2533 if (b_vp == NULL) {
2534 /* For whatever reason, this component is not valid.
2535 Don't try to read a component label from it. */
2536 return(EINVAL);
2537 }
2538
2539 /* get a block of the appropriate size... */
2540 bp = geteblk((int)dsize);
2541 bp->b_dev = dev;
2542
2543 /* get our ducks in a row for the read */
2544 bp->b_blkno = offset / DEV_BSIZE;
2545 bp->b_bcount = dsize;
2546 bp->b_flags |= B_READ;
2547 bp->b_resid = dsize;
2548
2549 bdev = bdevsw_lookup(bp->b_dev);
2550 if (bdev == NULL)
2551 return (ENXIO);
2552 (*bdev->d_strategy)(bp);
2553
2554 error = biowait(bp);
2555
2556 if (!error) {
2557 memcpy(data, bp->b_data, msize);
2558 }
2559
2560 brelse(bp, 0);
2561 return(error);
2562 }
2563
2564
/*
 * Write *clabel to the standard component-info offset on the given
 * device (synchronously).  secsize is the component's sector size,
 * used to size the label area.
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
2574
2575 /* ARGSUSED */
2576 static int
2577 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2578 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2579 {
2580 struct buf *bp;
2581 const struct bdevsw *bdev;
2582 int error;
2583
2584 /* get a block of the appropriate size... */
2585 bp = geteblk((int)dsize);
2586 bp->b_dev = dev;
2587
2588 /* get our ducks in a row for the write */
2589 bp->b_blkno = offset / DEV_BSIZE;
2590 bp->b_bcount = dsize;
2591 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2592 bp->b_resid = dsize;
2593
2594 memset(bp->b_data, 0, dsize);
2595 memcpy(bp->b_data, data, msize);
2596
2597 bdev = bdevsw_lookup(bp->b_dev);
2598 if (bdev == NULL)
2599 return (ENXIO);
2600 (*bdev->d_strategy)(bp);
2601 if (asyncp)
2602 return 0;
2603 error = biowait(bp);
2604 brelse(bp, 0);
2605 if (error) {
2606 #if 1
2607 printf("Failed to write RAID component info!\n");
2608 #endif
2609 }
2610
2611 return(error);
2612 }
2613
2614 void
2615 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2616 {
2617 int c;
2618
2619 for (c = 0; c < raidPtr->numCol; c++) {
2620 /* Skip dead disks. */
2621 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2622 continue;
2623 /* XXXjld: what if an error occurs here? */
2624 raidwrite_component_area(raidPtr->Disks[c].dev,
2625 raidPtr->raid_cinfo[c].ci_vp, map,
2626 RF_PARITYMAP_NBYTE,
2627 rf_parity_map_offset(raidPtr),
2628 rf_parity_map_size(raidPtr), 0);
2629 }
2630 }
2631
2632 void
2633 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2634 {
2635 struct rf_paritymap_ondisk tmp;
2636 int c,first;
2637
2638 first=1;
2639 for (c = 0; c < raidPtr->numCol; c++) {
2640 /* Skip dead disks. */
2641 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2642 continue;
2643 raidread_component_area(raidPtr->Disks[c].dev,
2644 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2645 RF_PARITYMAP_NBYTE,
2646 rf_parity_map_offset(raidPtr),
2647 rf_parity_map_size(raidPtr));
2648 if (first) {
2649 memcpy(map, &tmp, sizeof(*map));
2650 first = 0;
2651 } else {
2652 rf_paritymap_merge(map, &tmp);
2653 }
2654 }
2655 }
2656
/*
 * Bump the set's modification counter and mark every live component
 * (and every in-use spare) dirty on disk, so an unclean shutdown can
 * be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2716
2717
/*
 * Push fresh component labels to every optimal component and every
 * in-use spare.  The shared mod_counter is bumped first so all labels
 * written in this pass agree.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known clean, the clean bit
 * is set as well.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare replaced */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2792
2793 void
2794 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2795 {
2796
2797 if (vp != NULL) {
2798 if (auto_configured == 1) {
2799 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2800 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2801 vput(vp);
2802
2803 } else {
2804 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2805 }
2806 }
2807 }
2808
2809
2810 void
2811 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2812 {
2813 int r,c;
2814 struct vnode *vp;
2815 int acd;
2816
2817
2818 /* We take this opportunity to close the vnodes like we should.. */
2819
2820 for (c = 0; c < raidPtr->numCol; c++) {
2821 vp = raidPtr->raid_cinfo[c].ci_vp;
2822 acd = raidPtr->Disks[c].auto_configured;
2823 rf_close_component(raidPtr, vp, acd);
2824 raidPtr->raid_cinfo[c].ci_vp = NULL;
2825 raidPtr->Disks[c].auto_configured = 0;
2826 }
2827
2828 for (r = 0; r < raidPtr->numSpare; r++) {
2829 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2830 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2831 rf_close_component(raidPtr, vp, acd);
2832 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2833 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2834 }
2835 }
2836
2837
2838 void
2839 rf_ReconThread(struct rf_recon_req *req)
2840 {
2841 int s;
2842 RF_Raid_t *raidPtr;
2843
2844 s = splbio();
2845 raidPtr = (RF_Raid_t *) req->raidPtr;
2846 raidPtr->recon_in_progress = 1;
2847
2848 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2849 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2850
2851 RF_Free(req, sizeof(*req));
2852
2853 raidPtr->recon_in_progress = 0;
2854 splx(s);
2855
2856 /* That's all... */
2857 kthread_exit(0); /* does not return */
2858 }
2859
2860 void
2861 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2862 {
2863 int retcode;
2864 int s;
2865
2866 raidPtr->parity_rewrite_stripes_done = 0;
2867 raidPtr->parity_rewrite_in_progress = 1;
2868 s = splbio();
2869 retcode = rf_RewriteParity(raidPtr);
2870 splx(s);
2871 if (retcode) {
2872 printf("raid%d: Error re-writing parity (%d)!\n",
2873 raidPtr->raidid, retcode);
2874 } else {
2875 /* set the clean bit! If we shutdown correctly,
2876 the clean bit on each component label will get
2877 set */
2878 raidPtr->parity_good = RF_RAID_CLEAN;
2879 }
2880 raidPtr->parity_rewrite_in_progress = 0;
2881
2882 /* Anyone waiting for us to stop? If so, inform them... */
2883 if (raidPtr->waitShutdown) {
2884 wakeup(&raidPtr->parity_rewrite_in_progress);
2885 }
2886
2887 /* That's all... */
2888 kthread_exit(0); /* does not return */
2889 }
2890
2891
2892 void
2893 rf_CopybackThread(RF_Raid_t *raidPtr)
2894 {
2895 int s;
2896
2897 raidPtr->copyback_in_progress = 1;
2898 s = splbio();
2899 rf_CopybackReconstructedData(raidPtr);
2900 splx(s);
2901 raidPtr->copyback_in_progress = 0;
2902
2903 /* That's all... */
2904 kthread_exit(0); /* does not return */
2905 }
2906
2907
2908 void
2909 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2910 {
2911 int s;
2912 RF_Raid_t *raidPtr;
2913
2914 s = splbio();
2915 raidPtr = req->raidPtr;
2916 raidPtr->recon_in_progress = 1;
2917 rf_ReconstructInPlace(raidPtr, req->col);
2918 RF_Free(req, sizeof(*req));
2919 raidPtr->recon_in_progress = 0;
2920 splx(s);
2921
2922 /* That's all... */
2923 kthread_exit(0); /* does not return */
2924 }
2925
/*
 * Probe one candidate component: read its component label and, if the
 * label looks sane and fits within the partition, prepend a new
 * RF_AutoConfig_t for it to ac_list.  If the label is rejected, the
 * vnode is closed and released here.  Returns the (possibly extended)
 * list, or NULL on allocation failure — in which case the entire list
 * built so far has been freed.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* out of memory: tear down everything built so far */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label. Does it look reasonable? */
		if (rf_reasonable_label(clabel) &&
		    (clabel->partitionSize <= size)) {
			/* repair labels carrying stale numBlocksHi garbage */
			rf_fix_old_label_size(clabel, numsecs);
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			       cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: label rejected, drop our hold on the vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2984
2985 RF_AutoConfig_t *
2986 rf_find_raid_components(void)
2987 {
2988 struct vnode *vp;
2989 struct disklabel label;
2990 device_t dv;
2991 deviter_t di;
2992 dev_t dev;
2993 int bmajor, bminor, wedge;
2994 int error;
2995 int i;
2996 RF_AutoConfig_t *ac_list;
2997 uint64_t numsecs;
2998 unsigned secsize;
2999
3000 RF_ASSERT(raidPtr->bytesPerSector < rf_component_info_offset());
3001
3002 /* initialize the AutoConfig list */
3003 ac_list = NULL;
3004
3005 /* we begin by trolling through *all* the devices on the system */
3006
3007 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
3008 dv = deviter_next(&di)) {
3009
3010 /* we are only interested in disks... */
3011 if (device_class(dv) != DV_DISK)
3012 continue;
3013
3014 /* we don't care about floppies... */
3015 if (device_is_a(dv, "fd")) {
3016 continue;
3017 }
3018
3019 /* we don't care about CD's... */
3020 if (device_is_a(dv, "cd")) {
3021 continue;
3022 }
3023
3024 /* we don't care about md's... */
3025 if (device_is_a(dv, "md")) {
3026 continue;
3027 }
3028
3029 /* hdfd is the Atari/Hades floppy driver */
3030 if (device_is_a(dv, "hdfd")) {
3031 continue;
3032 }
3033
3034 /* fdisa is the Atari/Milan floppy driver */
3035 if (device_is_a(dv, "fdisa")) {
3036 continue;
3037 }
3038
3039 /* need to find the device_name_to_block_device_major stuff */
3040 bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
3041
3042 /* get a vnode for the raw partition of this disk */
3043
3044 wedge = device_is_a(dv, "dk");
3045 bminor = minor(device_unit(dv));
3046 dev = wedge ? makedev(bmajor, bminor) :
3047 MAKEDISKDEV(bmajor, bminor, RAW_PART);
3048 if (bdevvp(dev, &vp))
3049 panic("RAID can't alloc vnode");
3050
3051 error = VOP_OPEN(vp, FREAD, NOCRED);
3052
3053 if (error) {
3054 /* "Who cares." Continue looking
3055 for something that exists*/
3056 vput(vp);
3057 continue;
3058 }
3059
3060 error = getdisksize(vp, &numsecs, &secsize);
3061 if (error) {
3062 vput(vp);
3063 continue;
3064 }
3065 if (wedge) {
3066 struct dkwedge_info dkw;
3067 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
3068 NOCRED);
3069 if (error) {
3070 printf("RAIDframe: can't get wedge info for "
3071 "dev %s (%d)\n", device_xname(dv), error);
3072 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3073 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3074 vput(vp);
3075 continue;
3076 }
3077
3078 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
3079 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3080 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3081 vput(vp);
3082 continue;
3083 }
3084
3085 ac_list = rf_get_component(ac_list, dev, vp,
3086 device_xname(dv), dkw.dkw_size, numsecs, secsize);
3087 continue;
3088 }
3089
3090 /* Ok, the disk exists. Go get the disklabel. */
3091 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
3092 if (error) {
3093 /*
3094 * XXX can't happen - open() would
3095 * have errored out (or faked up one)
3096 */
3097 if (error != ENOTTY)
3098 printf("RAIDframe: can't get label for dev "
3099 "%s (%d)\n", device_xname(dv), error);
3100 }
3101
3102 /* don't need this any more. We'll allocate it again
3103 a little later if we really do... */
3104 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3105 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3106 vput(vp);
3107
3108 if (error)
3109 continue;
3110
3111 for (i = 0; i < label.d_npartitions; i++) {
3112 char cname[sizeof(ac_list->devname)];
3113
3114 /* We only support partitions marked as RAID */
3115 if (label.d_partitions[i].p_fstype != FS_RAID)
3116 continue;
3117
3118 dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
3119 if (bdevvp(dev, &vp))
3120 panic("RAID can't alloc vnode");
3121
3122 error = VOP_OPEN(vp, FREAD, NOCRED);
3123 if (error) {
3124 /* Whatever... */
3125 vput(vp);
3126 continue;
3127 }
3128 snprintf(cname, sizeof(cname), "%s%c",
3129 device_xname(dv), 'a' + i);
3130 ac_list = rf_get_component(ac_list, dev, vp, cname,
3131 label.d_partitions[i].p_size, numsecs, secsize);
3132 }
3133 }
3134 deviter_release(&di);
3135 return ac_list;
3136 }
3137
3138
3139 static int
3140 rf_reasonable_label(RF_ComponentLabel_t *clabel)
3141 {
3142
3143 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3144 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3145 ((clabel->clean == RF_RAID_CLEAN) ||
3146 (clabel->clean == RF_RAID_DIRTY)) &&
3147 clabel->row >=0 &&
3148 clabel->column >= 0 &&
3149 clabel->num_rows > 0 &&
3150 clabel->num_columns > 0 &&
3151 clabel->row < clabel->num_rows &&
3152 clabel->column < clabel->num_columns &&
3153 clabel->blockSize > 0 &&
3154 clabel->numBlocks > 0) {
3155 /* label looks reasonable enough... */
3156 return(1);
3157 }
3158 return(0);
3159 }
3160
3161
3162 /*
3163 * For reasons yet unknown, some old component labels have garbage in
3164 * the newer numBlocksHi region, and this causes lossage. Since those
3165 * disks will also have numsecs set to less than 32 bits of sectors,
3166 * we can determine when this corruption has occured, and fix it.
3167 */
3168 static void
3169 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3170 {
3171
3172 if (clabel->numBlocksHi && numsecs < ((uint64_t)1 << 32)) {
3173 printf("WARNING: total sectors < 32 bits, yet numBlocksHi set\n"
3174 "WARNING: resetting numBlocksHi to zero.\n");
3175 clabel->numBlocksHi = 0;
3176 }
3177 }
3178
3179
#ifdef DEBUG
/*
 * Dump the contents of a component label to the console (debug builds
 * only).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	/* reassemble the 64-bit block count from the split fields */
	uint64_t numBlocks = clabel->numBlocks;

	numBlocks |= (uint64_t)clabel->numBlocksHi << 32;

	printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf(" Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf(" Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf(" Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf(" Config order: %d\n", clabel->config_order);
#endif

}
#endif
3210
3211 RF_ConfigSet_t *
3212 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3213 {
3214 RF_AutoConfig_t *ac;
3215 RF_ConfigSet_t *config_sets;
3216 RF_ConfigSet_t *cset;
3217 RF_AutoConfig_t *ac_next;
3218
3219
3220 config_sets = NULL;
3221
3222 /* Go through the AutoConfig list, and figure out which components
3223 belong to what sets. */
3224 ac = ac_list;
3225 while(ac!=NULL) {
3226 /* we're going to putz with ac->next, so save it here
3227 for use at the end of the loop */
3228 ac_next = ac->next;
3229
3230 if (config_sets == NULL) {
3231 /* will need at least this one... */
3232 config_sets = (RF_ConfigSet_t *)
3233 malloc(sizeof(RF_ConfigSet_t),
3234 M_RAIDFRAME, M_NOWAIT);
3235 if (config_sets == NULL) {
3236 panic("rf_create_auto_sets: No memory!");
3237 }
3238 /* this one is easy :) */
3239 config_sets->ac = ac;
3240 config_sets->next = NULL;
3241 config_sets->rootable = 0;
3242 ac->next = NULL;
3243 } else {
3244 /* which set does this component fit into? */
3245 cset = config_sets;
3246 while(cset!=NULL) {
3247 if (rf_does_it_fit(cset, ac)) {
3248 /* looks like it matches... */
3249 ac->next = cset->ac;
3250 cset->ac = ac;
3251 break;
3252 }
3253 cset = cset->next;
3254 }
3255 if (cset==NULL) {
3256 /* didn't find a match above... new set..*/
3257 cset = (RF_ConfigSet_t *)
3258 malloc(sizeof(RF_ConfigSet_t),
3259 M_RAIDFRAME, M_NOWAIT);
3260 if (cset == NULL) {
3261 panic("rf_create_auto_sets: No memory!");
3262 }
3263 cset->ac = ac;
3264 ac->next = NULL;
3265 cset->next = config_sets;
3266 cset->rootable = 0;
3267 config_sets = cset;
3268 }
3269 }
3270 ac = ac_next;
3271 }
3272
3273
3274 return(config_sets);
3275 }
3276
3277 static int
3278 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3279 {
3280 RF_ComponentLabel_t *clabel1, *clabel2;
3281
3282 /* If this one matches the *first* one in the set, that's good
3283 enough, since the other members of the set would have been
3284 through here too... */
3285 /* note that we are not checking partitionSize here..
3286
3287 Note that we are also not checking the mod_counters here.
3288 If everything else matches execpt the mod_counter, that's
3289 good enough for this test. We will deal with the mod_counters
3290 a little later in the autoconfiguration process.
3291
3292 (clabel1->mod_counter == clabel2->mod_counter) &&
3293
3294 The reason we don't check for this is that failed disks
3295 will have lower modification counts. If those disks are
3296 not added to the set they used to belong to, then they will
3297 form their own set, which may result in 2 different sets,
3298 for example, competing to be configured at raid0, and
3299 perhaps competing to be the root filesystem set. If the
3300 wrong ones get configured, or both attempt to become /,
3301 weird behaviour and or serious lossage will occur. Thus we
3302 need to bring them into the fold here, and kick them out at
3303 a later point.
3304
3305 */
3306
3307 clabel1 = cset->ac->clabel;
3308 clabel2 = ac->clabel;
3309 if ((clabel1->version == clabel2->version) &&
3310 (clabel1->serial_number == clabel2->serial_number) &&
3311 (clabel1->num_rows == clabel2->num_rows) &&
3312 (clabel1->num_columns == clabel2->num_columns) &&
3313 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3314 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3315 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3316 (clabel1->parityConfig == clabel2->parityConfig) &&
3317 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3318 (clabel1->blockSize == clabel2->blockSize) &&
3319 (clabel1->numBlocks == clabel2->numBlocks) &&
3320 (clabel1->numBlocksHi == clabel2->numBlocksHi) &&
3321 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3322 (clabel1->root_partition == clabel2->root_partition) &&
3323 (clabel1->last_unit == clabel2->last_unit) &&
3324 (clabel1->config_order == clabel2->config_order)) {
3325 /* if it get's here, it almost *has* to be a match */
3326 } else {
3327 /* it's not consistent with somebody in the set..
3328 punt */
3329 return(0);
3330 }
3331 /* all was fine.. it must fit... */
3332 return(1);
3333 }
3334
/*
 * Decide whether config set 'cset' contains enough live components
 * (i.e. components carrying the newest mod_counter seen in the set)
 * to be configured.  Returns 1 if the set is usable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set. If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The set's authoritative mod_counter is the maximum over all
	   members; stale (failed) components carry smaller values. */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a member with that column number AND
	   the current mod_counter; anything else counts as missing. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				/* RAID 1: mirror pairs are (0,1), (2,3), ...
				   Only when BOTH halves of a pair are
				   missing is the set unusable. */
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* Tolerated failures by level: RAID 0 tolerates none; RAID 4/5
	   tolerate one.  (RAID 1 was fully handled in the loop above.) */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3437
3438 void
3439 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3440 RF_Raid_t *raidPtr)
3441 {
3442 RF_ComponentLabel_t *clabel;
3443 int i;
3444
3445 clabel = ac->clabel;
3446
3447 /* 1. Fill in the common stuff */
3448 config->numRow = clabel->num_rows = 1;
3449 config->numCol = clabel->num_columns;
3450 config->numSpare = 0; /* XXX should this be set here? */
3451 config->sectPerSU = clabel->sectPerSU;
3452 config->SUsPerPU = clabel->SUsPerPU;
3453 config->SUsPerRU = clabel->SUsPerRU;
3454 config->parityConfig = clabel->parityConfig;
3455 /* XXX... */
3456 strcpy(config->diskQueueType,"fifo");
3457 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3458 config->layoutSpecificSize = 0; /* XXX ?? */
3459
3460 while(ac!=NULL) {
3461 /* row/col values will be in range due to the checks
3462 in reasonable_label() */
3463 strcpy(config->devnames[0][ac->clabel->column],
3464 ac->devname);
3465 ac = ac->next;
3466 }
3467
3468 for(i=0;i<RF_MAXDBGV;i++) {
3469 config->debugVars[i][0] = 0;
3470 }
3471 }
3472
3473 int
3474 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3475 {
3476 RF_ComponentLabel_t *clabel;
3477 int column;
3478 int sparecol;
3479
3480 raidPtr->autoconfigure = new_value;
3481
3482 for(column=0; column<raidPtr->numCol; column++) {
3483 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3484 clabel = raidget_component_label(raidPtr, column);
3485 clabel->autoconfigure = new_value;
3486 raidflush_component_label(raidPtr, column);
3487 }
3488 }
3489 for(column = 0; column < raidPtr->numSpare ; column++) {
3490 sparecol = raidPtr->numCol + column;
3491 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3492 clabel = raidget_component_label(raidPtr, sparecol);
3493 clabel->autoconfigure = new_value;
3494 raidflush_component_label(raidPtr, sparecol);
3495 }
3496 }
3497 return(new_value);
3498 }
3499
3500 int
3501 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3502 {
3503 RF_ComponentLabel_t *clabel;
3504 int column;
3505 int sparecol;
3506
3507 raidPtr->root_partition = new_value;
3508 for(column=0; column<raidPtr->numCol; column++) {
3509 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3510 clabel = raidget_component_label(raidPtr, column);
3511 clabel->root_partition = new_value;
3512 raidflush_component_label(raidPtr, column);
3513 }
3514 }
3515 for(column = 0; column < raidPtr->numSpare ; column++) {
3516 sparecol = raidPtr->numCol + column;
3517 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3518 clabel = raidget_component_label(raidPtr, sparecol);
3519 clabel->root_partition = new_value;
3520 raidflush_component_label(raidPtr, sparecol);
3521 }
3522 }
3523 return(new_value);
3524 }
3525
3526 void
3527 rf_release_all_vps(RF_ConfigSet_t *cset)
3528 {
3529 RF_AutoConfig_t *ac;
3530
3531 ac = cset->ac;
3532 while(ac!=NULL) {
3533 /* Close the vp, and give it back */
3534 if (ac->vp) {
3535 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3536 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3537 vput(ac->vp);
3538 ac->vp = NULL;
3539 }
3540 ac = ac->next;
3541 }
3542 }
3543
3544
3545 void
3546 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3547 {
3548 RF_AutoConfig_t *ac;
3549 RF_AutoConfig_t *next_ac;
3550
3551 ac = cset->ac;
3552 while(ac!=NULL) {
3553 next_ac = ac->next;
3554 /* nuke the label */
3555 free(ac->clabel, M_RAIDFRAME);
3556 /* cleanup the config structure */
3557 free(ac, M_RAIDFRAME);
3558 /* "next.." */
3559 ac = next_ac;
3560 }
3561 /* and, finally, nuke the config set */
3562 free(cset, M_RAIDFRAME);
3563 }
3564
3565
3566 void
3567 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3568 {
3569 /* current version number */
3570 clabel->version = RF_COMPONENT_LABEL_VERSION;
3571 clabel->serial_number = raidPtr->serial_number;
3572 clabel->mod_counter = raidPtr->mod_counter;
3573
3574 clabel->num_rows = 1;
3575 clabel->num_columns = raidPtr->numCol;
3576 clabel->clean = RF_RAID_DIRTY; /* not clean */
3577 clabel->status = rf_ds_optimal; /* "It's good!" */
3578
3579 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3580 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3581 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3582
3583 clabel->blockSize = raidPtr->bytesPerSector;
3584 clabel->numBlocks = raidPtr->sectorsPerDisk;
3585 clabel->numBlocksHi = raidPtr->sectorsPerDisk >> 32;
3586
3587 /* XXX not portable */
3588 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3589 clabel->maxOutstanding = raidPtr->maxOutstanding;
3590 clabel->autoconfigure = raidPtr->autoconfigure;
3591 clabel->root_partition = raidPtr->root_partition;
3592 clabel->last_unit = raidPtr->raidid;
3593 clabel->config_order = raidPtr->config_order;
3594
3595 #ifndef RF_NO_PARITY_MAP
3596 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3597 #endif
3598 }
3599
/*
 * Configure the set described by 'cset'.  Picks a RAID unit (preferring
 * the unit recorded in the component labels), builds an RF_Config_t,
 * and runs rf_Configure() on it.  On success *unit is set to the unit
 * number used and 0 is returned; on failure a non-zero value is
 * returned and *unit is left at -1 (or the attempted unit).
 */
int
rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	int retcode;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	retcode = 0;
	*unit = -1;

	/* 1. Create a config structure */

	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
				       M_RAIDFRAME,
				       M_NOWAIT);
	if (config==NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return(1);
	}

	memset(config, 0, sizeof(RF_Config_t));

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	if ((raidID < 0) || (raidID >= numraid)) {
		/* let's not wander off into lala land. */
		raidID = numraid - 1;
	}
	if (raidPtrs[raidID]->valid != 0) {

		/*
		   Nope... Go looking for an alternative...
		   Start high so we don't immediately use raid0 if that's
		   not taken.
		*/

		for(raidID = numraid - 1; raidID >= 0; raidID--) {
			if (raidPtrs[raidID]->valid == 0) {
				/* can use this one! */
				break;
			}
		}
	}

	/* raidID is -1 here only if the search loop above exhausted
	   every unit without finding a free one. */
	if (raidID < 0) {
		/* punt... */
		printf("Unable to auto configure this set!\n");
		printf("(Out of RAID devs!)\n");
		free(config, M_RAIDFRAME);
		return(1);
	}

#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = raidPtrs[raidID];

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	retcode = rf_Configure(raidPtr, config, cset->ac);

	if (retcode == 0) {

		raidinit(raidPtrs[raidID]);

		rf_markalldirty(raidPtrs[raidID]);
		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
		if (cset->ac->clabel->root_partition==1) {
			/* everything configured just fine.  Make a note
			   that this set is eligible to be root. */
			cset->rootable = 1;
			/* XXX do this here? */
			raidPtrs[raidID]->root_partition = 1;
		}
	}

	/* 5. Cleanup */
	/* the config structure was only needed for rf_Configure() */
	free(config, M_RAIDFRAME);

	*unit = raidID;
	return(retcode);
}
3700
3701 void
3702 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3703 {
3704 struct buf *bp;
3705
3706 bp = (struct buf *)desc->bp;
3707 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3708 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3709 }
3710
/*
 * Initialize a kernel memory pool for objects of 'size' bytes, primed
 * with 'xmin' items and capped at 'xmax' idle items.  'w_chan' is the
 * wait-channel string passed to pool_init().
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);	/* cap idle items at xmax */
	pool_prime(p, xmin);	/* pre-allocate xmin items up front */
	pool_setlowat(p, xmin);	/* keep at least xmin items around */
}
3720
3721 /*
3722 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3723 * if there is IO pending and if that IO could possibly be done for a
3724 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3725 * otherwise.
3726 *
3727 */
3728
3729 int
3730 rf_buf_queue_check(int raidid)
3731 {
3732 if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) &&
3733 raidPtrs[raidid]->openings > 0) {
3734 /* there is work to do */
3735 return 0;
3736 }
3737 /* default is nothing to do */
3738 return 1;
3739 }
3740
3741 int
3742 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
3743 {
3744 uint64_t numsecs;
3745 unsigned secsize;
3746 int error;
3747
3748 error = getdisksize(vp, &numsecs, &secsize);
3749 if (error == 0) {
3750 diskPtr->blockSize = secsize;
3751 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3752 diskPtr->partitionSize = numsecs;
3753 return 0;
3754 }
3755 return error;
3756 }
3757
/* autoconf match function: raid is a pseudo-device, always matches. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3763
/*
 * autoconf attach function: nothing to do here; the real per-unit
 * setup happens when the set is configured (see raidinit callers).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3769
3770
3771 static int
3772 raid_detach(device_t self, int flags)
3773 {
3774 int error;
3775 struct raid_softc *rs = &raid_softc[device_unit(self)];
3776
3777 if ((error = raidlock(rs)) != 0)
3778 return (error);
3779
3780 error = raid_detach_unlocked(rs);
3781
3782 raidunlock(rs);
3783
3784 return error;
3785 }
3786
/*
 * Publish a synthetic disk geometry for the RAID unit via proplib so
 * that disklabel and friends see plausible values.  The "geometry" is
 * fabricated from the stripe layout (there is no real CHS geometry).
 */
static void
rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	prop_dictionary_t disk_info, odisk_info, geom;
	disk_info = prop_dictionary_create();
	geom = prop_dictionary_create();
	prop_dictionary_set_uint64(geom, "sectors-per-unit",
				   raidPtr->totalSectors);
	prop_dictionary_set_uint32(geom, "sector-size",
				   raidPtr->bytesPerSector);

	/* fake geometry: one "track" per data stripe... */
	prop_dictionary_set_uint16(geom, "sectors-per-track",
				   raidPtr->Layout.dataSectorsPerStripe);
	/* ...and 4 tracks per cylinder per column */
	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
				   4 * raidPtr->numCol);

	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
	   raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
	   (4 * raidPtr->numCol)));

	/* dictionaries are retained by the sets below, so drop our
	   reference to 'geom' and swap out the old disk-info */
	prop_dictionary_set(disk_info, "geometry", geom);
	prop_object_release(geom);
	prop_dictionary_set(device_properties(rs->sc_dev),
			    "disk-info", disk_info);
	odisk_info = rs->sc_dkdev.dk_info;
	rs->sc_dkdev.dk_info = disk_info;
	if (odisk_info)
		prop_object_release(odisk_info);
}
3816
3817 /*
3818 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3819 * We end up returning whatever error was returned by the first cache flush
3820 * that fails.
3821 */
3822
3823 int
3824 rf_sync_component_caches(RF_Raid_t *raidPtr)
3825 {
3826 int c, sparecol;
3827 int e,error;
3828 int force = 1;
3829
3830 error = 0;
3831 for (c = 0; c < raidPtr->numCol; c++) {
3832 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3833 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3834 &force, FWRITE, NOCRED);
3835 if (e) {
3836 if (e != ENODEV)
3837 printf("raid%d: cache flush to component %s failed.\n",
3838 raidPtr->raidid, raidPtr->Disks[c].devname);
3839 if (error == 0) {
3840 error = e;
3841 }
3842 }
3843 }
3844 }
3845
3846 for( c = 0; c < raidPtr->numSpare ; c++) {
3847 sparecol = raidPtr->numCol + c;
3848 /* Need to ensure that the reconstruct actually completed! */
3849 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3850 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3851 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3852 if (e) {
3853 if (e != ENODEV)
3854 printf("raid%d: cache flush to component %s failed.\n",
3855 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3856 if (error == 0) {
3857 error = e;
3858 }
3859 }
3860 }
3861 }
3862 return error;
3863 }
3864