rf_netbsdkintf.c revision 1.284 1 /* $NetBSD: rf_netbsdkintf.c,v 1.284 2011/03/18 23:53:26 mrg Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.284 2011/03/18 23:53:26 mrg Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #include "raid.h"
110 #endif
111
112 #include <sys/param.h>
113 #include <sys/errno.h>
114 #include <sys/pool.h>
115 #include <sys/proc.h>
116 #include <sys/queue.h>
117 #include <sys/disk.h>
118 #include <sys/device.h>
119 #include <sys/stat.h>
120 #include <sys/ioctl.h>
121 #include <sys/fcntl.h>
122 #include <sys/systm.h>
123 #include <sys/vnode.h>
124 #include <sys/disklabel.h>
125 #include <sys/conf.h>
126 #include <sys/buf.h>
127 #include <sys/bufq.h>
128 #include <sys/reboot.h>
129 #include <sys/kauth.h>
130
131 #include <prop/proplib.h>
132
133 #include <dev/raidframe/raidframevar.h>
134 #include <dev/raidframe/raidframeio.h>
135 #include <dev/raidframe/rf_paritymap.h>
136
137 #include "rf_raid.h"
138 #include "rf_copyback.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150
151 #ifdef COMPAT_50
152 #include "rf_compat50.h"
153 #endif
154
155 #ifdef DEBUG
156 int rf_kdebug_level = 0;
157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
158 #else /* DEBUG */
159 #define db1_printf(a) { }
160 #endif /* DEBUG */
161
162 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
163
164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
165 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
166
167 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
168 * spare table */
169 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
170 * installation process */
171 #endif
172
173 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
174
175 /* prototypes */
176 static void KernelWakeupFunc(struct buf *);
177 static void InitBP(struct buf *, struct vnode *, unsigned,
178 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
179 void *, int, struct proc *);
180 static void raidinit(RF_Raid_t *);
181
182 void raidattach(int);
183 static int raid_match(device_t, cfdata_t, void *);
184 static void raid_attach(device_t, device_t, void *);
185 static int raid_detach(device_t, int);
186
187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
188 daddr_t, daddr_t);
189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
190 daddr_t, daddr_t, int);
191
192 static int raidwrite_component_label(unsigned,
193 dev_t, struct vnode *, RF_ComponentLabel_t *);
194 static int raidread_component_label(unsigned,
195 dev_t, struct vnode *, RF_ComponentLabel_t *);
196
197
198 dev_type_open(raidopen);
199 dev_type_close(raidclose);
200 dev_type_read(raidread);
201 dev_type_write(raidwrite);
202 dev_type_ioctl(raidioctl);
203 dev_type_strategy(raidstrategy);
204 dev_type_dump(raiddump);
205 dev_type_size(raidsize);
206
207 const struct bdevsw raid_bdevsw = {
208 raidopen, raidclose, raidstrategy, raidioctl,
209 raiddump, raidsize, D_DISK
210 };
211
212 const struct cdevsw raid_cdevsw = {
213 raidopen, raidclose, raidread, raidwrite, raidioctl,
214 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
215 };
216
217 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
218
219 /* XXX Not sure if the following should be replacing the raidPtrs above,
220 or if it should be used in conjunction with that...
221 */
222
/*
 * Per-unit driver state.  One of these exists for each configured raid
 * unit; the array is allocated in raidattach() and indexed by unit
 * number (see raidunit()).  The RAIDframe core state for the same unit
 * lives in the parallel raidPtrs[] array -- see the XXX above about
 * possibly merging the two.
 */
struct raid_softc {
	device_t sc_dev;		/* autoconf device handle */
	int     sc_flags;		/* flags (RAIDF_*, below) */
	int     sc_cflags;		/* configuration flags */
	uint64_t sc_size;		/* size of the raid device, in DEV_BSIZE blocks */
	char    sc_xname[20];		/* XXX external name */
	struct disk sc_dkdev;		/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */
239
240 #define raidunit(x) DISKUNIT(x)
241 int numraid = 0;
242
243 extern struct cfdriver raid_cd;
244 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
245 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
246 DVF_DETACH_SHUTDOWN);
247
248 /*
249 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
250 * Be aware that large numbers can allow the driver to consume a lot of
251 * kernel memory, especially on writes, and in degraded mode reads.
252 *
253 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
254 * a single 64K write will typically require 64K for the old data,
255 * 64K for the old parity, and 64K for the new parity, for a total
256 * of 192K (if the parity buffer is not re-used immediately).
257 * Even it if is used immediately, that's still 128K, which when multiplied
258 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
259 *
260 * Now in degraded mode, for example, a 64K read on the above setup may
261 * require data reconstruction, which will require *all* of the 4 remaining
262 * disks to participate -- 4 * 32K/disk == 128K again.
263 */
264
265 #ifndef RAIDOUTSTANDING
266 #define RAIDOUTSTANDING 6
267 #endif
268
269 #define RAIDLABELDEV(dev) \
270 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
271
272 /* declared here, and made public, for the benefit of KVM stuff.. */
273 struct raid_softc *raid_softc;
274
275 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
276 struct disklabel *);
277 static void raidgetdisklabel(dev_t);
278 static void raidmakedisklabel(struct raid_softc *);
279
280 static int raidlock(struct raid_softc *);
281 static void raidunlock(struct raid_softc *);
282
283 static int raid_detach_unlocked(struct raid_softc *);
284
285 static void rf_markalldirty(RF_Raid_t *);
286 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
287
288 void rf_ReconThread(struct rf_recon_req *);
289 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
290 void rf_CopybackThread(RF_Raid_t *raidPtr);
291 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
292 int rf_autoconfig(device_t);
293 void rf_buildroothack(RF_ConfigSet_t *);
294
295 RF_AutoConfig_t *rf_find_raid_components(void);
296 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
297 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
298 static int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
299 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
300 int rf_set_autoconfig(RF_Raid_t *, int);
301 int rf_set_rootpartition(RF_Raid_t *, int);
302 void rf_release_all_vps(RF_ConfigSet_t *);
303 void rf_cleanup_config_set(RF_ConfigSet_t *);
304 int rf_have_enough_components(RF_ConfigSet_t *);
305 int rf_auto_config_set(RF_ConfigSet_t *, int *);
306 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
307
308 static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not
309 allow autoconfig to take place.
310 Note that this is overridden by having
311 RAID_AUTOCONFIG as an option in the
312 kernel config file. */
313
314 struct RF_Pools_s rf_pools;
315
316 void
317 raidattach(int num)
318 {
319 int raidID;
320 int i, rc;
321
322 aprint_debug("raidattach: Asked for %d units\n", num);
323
324 if (num <= 0) {
325 #ifdef DIAGNOSTIC
326 panic("raidattach: count <= 0");
327 #endif
328 return;
329 }
330 /* This is where all the initialization stuff gets done. */
331
332 numraid = num;
333
334 /* Make some space for requested number of units... */
335
336 RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
337 if (raidPtrs == NULL) {
338 panic("raidPtrs is NULL!!");
339 }
340
341 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
342 rf_mutex_init(&rf_sparet_wait_mutex);
343
344 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
345 #endif
346
347 for (i = 0; i < num; i++)
348 raidPtrs[i] = NULL;
349 rc = rf_BootRaidframe();
350 if (rc == 0)
351 aprint_verbose("Kernelized RAIDframe activated\n");
352 else
353 panic("Serious error booting RAID!!");
354
355 /* put together some datastructures like the CCD device does.. This
356 * lets us lock the device and what-not when it gets opened. */
357
358 raid_softc = (struct raid_softc *)
359 malloc(num * sizeof(struct raid_softc),
360 M_RAIDFRAME, M_NOWAIT);
361 if (raid_softc == NULL) {
362 aprint_error("WARNING: no memory for RAIDframe driver\n");
363 return;
364 }
365
366 memset(raid_softc, 0, num * sizeof(struct raid_softc));
367
368 for (raidID = 0; raidID < num; raidID++) {
369 bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);
370
371 RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
372 (RF_Raid_t *));
373 if (raidPtrs[raidID] == NULL) {
374 aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
375 numraid = raidID;
376 return;
377 }
378 }
379
380 if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
381 aprint_error("raidattach: config_cfattach_attach failed?\n");
382 }
383
384 #ifdef RAID_AUTOCONFIG
385 raidautoconfig = 1;
386 #endif
387
388 /*
389 * Register a finalizer which will be used to auto-config RAID
390 * sets once all real hardware devices have been found.
391 */
392 if (config_finalize_register(NULL, rf_autoconfig) != 0)
393 aprint_error("WARNING: unable to register RAIDframe finalizer\n");
394 }
395
396 int
397 rf_autoconfig(device_t self)
398 {
399 RF_AutoConfig_t *ac_list;
400 RF_ConfigSet_t *config_sets;
401
402 if (raidautoconfig == 0)
403 return (0);
404
405 /* XXX This code can only be run once. */
406 raidautoconfig = 0;
407
408 /* 1. locate all RAID components on the system */
409 aprint_debug("Searching for RAID components...\n");
410 ac_list = rf_find_raid_components();
411
412 /* 2. Sort them into their respective sets. */
413 config_sets = rf_create_auto_sets(ac_list);
414
415 /*
416 * 3. Evaluate each set andconfigure the valid ones.
417 * This gets done in rf_buildroothack().
418 */
419 rf_buildroothack(config_sets);
420
421 return 1;
422 }
423
/*
 * rf_buildroothack(config_sets):
 *	Walk the list of candidate configuration sets produced by
 *	rf_create_auto_sets(), auto-configure each set that is complete
 *	and flagged for autoconfiguration, and release the resources of
 *	the rest.  Afterwards, if exactly one configured set is marked
 *	rootable, point booted_device at it; if several are, try to
 *	disambiguate by matching component names against the device the
 *	machine actually booted from, falling back to RB_ASKNAME.
 *	Consumes (frees) the config_sets list.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int retcode;
	int raidID;
	int rootID;
	int col;
	int num_root;
	char *devname;

	rootID = 0;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		/* cset is freed by rf_cleanup_config_set() below, so
		   grab the next pointer first. */
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure==1) {
			retcode = rf_auto_config_set(cset,&raidID);
			if (!retcode) {
				aprint_debug("raid%d: configured ok\n", raidID);
				if (cset->rootable) {
					rootID = raidID;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		/* Unambiguous: the single rootable set becomes root. */
		booted_device = raid_softc[rootID].sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/*
		 * Re-count, keeping only rootable sets that actually
		 * contain the device we booted from as a component.
		 */
		num_root = 0;
		for (raidID = 0; raidID < numraid; raidID++) {
			if (raidPtrs[raidID]->valid == 0)
				continue;

			if (raidPtrs[raidID]->root_partition == 0)
				continue;

			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
				devname = raidPtrs[raidID]->Disks[col].devname;
				/* skip the "/dev/" prefix before comparing
				   against the autoconf device name */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
					    strlen(device_xname(booted_device))) != 0)
					continue;
				aprint_debug("raid%d includes boot device %s\n",
				       raidID, devname);
				num_root++;
				rootID = raidID;
			}
		}

		if (num_root == 1) {
			booted_device = raid_softc[rootID].sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
518
519
520 int
521 raidsize(dev_t dev)
522 {
523 struct raid_softc *rs;
524 struct disklabel *lp;
525 int part, unit, omask, size;
526
527 unit = raidunit(dev);
528 if (unit >= numraid)
529 return (-1);
530 rs = &raid_softc[unit];
531
532 if ((rs->sc_flags & RAIDF_INITED) == 0)
533 return (-1);
534
535 part = DISKPART(dev);
536 omask = rs->sc_dkdev.dk_openmask & (1 << part);
537 lp = rs->sc_dkdev.dk_label;
538
539 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
540 return (-1);
541
542 if (lp->d_partitions[part].p_fstype != FS_SWAP)
543 size = -1;
544 else
545 size = lp->d_partitions[part].p_size *
546 (lp->d_secsize / DEV_BSIZE);
547
548 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
549 return (-1);
550
551 return (size);
552
553 }
554
/*
 * raiddump:
 *	Crash-dump entry point.  Writes `size' bytes at `va' to block
 *	`blkno' of the dump partition.  Only RAID 1 sets are supported:
 *	with a single data column the dump can go straight to one
 *	surviving component without any striping/parity math.  Picks a
 *	live component (preferring the master, then a spare of the
 *	master, then the slave, then a spare of the slave) and calls
 *	that component's block-device d_dump directly.
 *	Returns 0 on success or an errno.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int part, c, sparecol, j, scol, dumpto;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);

	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* dumps are always in whole DEV_BSIZE blocks */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column it is
			   standing in for (scol). */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	 */

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
/* ARGSUSED */
/*
 * raidopen:
 *	Open entry point (both block and character devices).  Takes the
 *	per-unit lock, refuses units that are shutting down, reloads the
 *	disklabel on first open, validates that the requested partition
 *	exists, records the open in the appropriate openmask, and marks
 *	all components dirty on the transition from closed to open.
 *	Returns 0 or an errno.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of an initialized unit: (re)read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(raidPtrs[unit]);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
778 /* ARGSUSED */
779 int
780 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
781 {
782 int unit = raidunit(dev);
783 struct raid_softc *rs;
784 int error = 0;
785 int part;
786
787 if (unit >= numraid)
788 return (ENXIO);
789 rs = &raid_softc[unit];
790
791 if ((error = raidlock(rs)) != 0)
792 return (error);
793
794 part = DISKPART(dev);
795
796 /* ...that much closer to allowing unconfiguration... */
797 switch (fmt) {
798 case S_IFCHR:
799 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
800 break;
801
802 case S_IFBLK:
803 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
804 break;
805 }
806 rs->sc_dkdev.dk_openmask =
807 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
808
809 if ((rs->sc_dkdev.dk_openmask == 0) &&
810 ((rs->sc_flags & RAIDF_INITED) != 0)) {
811 /* Last one... device is not unconfigured yet.
812 Device shutdown has taken care of setting the
813 clean bits if RAIDF_INITED is not set
814 mark things as clean... */
815
816 rf_update_component_labels(raidPtrs[unit],
817 RF_FINAL_COMPONENT_UPDATE);
818
819 /* If the kernel is shutting down, it will detach
820 * this RAID set soon enough.
821 */
822 }
823
824 raidunlock(rs);
825 return (0);
826
827 }
828
829 void
830 raidstrategy(struct buf *bp)
831 {
832 int s;
833
834 unsigned int raidID = raidunit(bp->b_dev);
835 RF_Raid_t *raidPtr;
836 struct raid_softc *rs = &raid_softc[raidID];
837 int wlabel;
838
839 if ((rs->sc_flags & RAIDF_INITED) ==0) {
840 bp->b_error = ENXIO;
841 goto done;
842 }
843 if (raidID >= numraid || !raidPtrs[raidID]) {
844 bp->b_error = ENODEV;
845 goto done;
846 }
847 raidPtr = raidPtrs[raidID];
848 if (!raidPtr->valid) {
849 bp->b_error = ENODEV;
850 goto done;
851 }
852 if (bp->b_bcount == 0) {
853 db1_printf(("b_bcount is zero..\n"));
854 goto done;
855 }
856
857 /*
858 * Do bounds checking and adjust transfer. If there's an
859 * error, the bounds check will flag that for us.
860 */
861
862 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
863 if (DISKPART(bp->b_dev) == RAW_PART) {
864 uint64_t size; /* device size in DEV_BSIZE unit */
865
866 if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
867 size = raidPtr->totalSectors <<
868 (raidPtr->logBytesPerSector - DEV_BSHIFT);
869 } else {
870 size = raidPtr->totalSectors >>
871 (DEV_BSHIFT - raidPtr->logBytesPerSector);
872 }
873 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
874 goto done;
875 }
876 } else {
877 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
878 db1_printf(("Bounds check failed!!:%d %d\n",
879 (int) bp->b_blkno, (int) wlabel));
880 goto done;
881 }
882 }
883 s = splbio();
884
885 bp->b_resid = 0;
886
887 /* stuff it onto our queue */
888 bufq_put(rs->buf_queue, bp);
889
890 /* scheduled the IO to happen at the next convenient time */
891 wakeup(&(raidPtrs[raidID]->iodone));
892
893 splx(s);
894 return;
895
896 done:
897 bp->b_resid = bp->b_bcount;
898 biodone(bp);
899 }
900 /* ARGSUSED */
901 int
902 raidread(dev_t dev, struct uio *uio, int flags)
903 {
904 int unit = raidunit(dev);
905 struct raid_softc *rs;
906
907 if (unit >= numraid)
908 return (ENXIO);
909 rs = &raid_softc[unit];
910
911 if ((rs->sc_flags & RAIDF_INITED) == 0)
912 return (ENXIO);
913
914 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
915
916 }
917 /* ARGSUSED */
918 int
919 raidwrite(dev_t dev, struct uio *uio, int flags)
920 {
921 int unit = raidunit(dev);
922 struct raid_softc *rs;
923
924 if (unit >= numraid)
925 return (ENXIO);
926 rs = &raid_softc[unit];
927
928 if ((rs->sc_flags & RAIDF_INITED) == 0)
929 return (ENXIO);
930
931 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
932
933 }
934
935 static int
936 raid_detach_unlocked(struct raid_softc *rs)
937 {
938 int error;
939 RF_Raid_t *raidPtr;
940
941 raidPtr = raidPtrs[device_unit(rs->sc_dev)];
942
943 /*
944 * If somebody has a partition mounted, we shouldn't
945 * shutdown.
946 */
947 if (rs->sc_dkdev.dk_openmask != 0)
948 return EBUSY;
949
950 if ((rs->sc_flags & RAIDF_INITED) == 0)
951 ; /* not initialized: nothing to do */
952 else if ((error = rf_Shutdown(raidPtr)) != 0)
953 return error;
954 else
955 rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
956
957 /* Detach the disk. */
958 dkwedge_delall(&rs->sc_dkdev);
959 disk_detach(&rs->sc_dkdev);
960 disk_destroy(&rs->sc_dkdev);
961
962 return 0;
963 }
964
965 int
966 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
967 {
968 int unit = raidunit(dev);
969 int error = 0;
970 int part, pmask;
971 cfdata_t cf;
972 struct raid_softc *rs;
973 RF_Config_t *k_cfg, *u_cfg;
974 RF_Raid_t *raidPtr;
975 RF_RaidDisk_t *diskPtr;
976 RF_AccTotals_t *totals;
977 RF_DeviceConfig_t *d_cfg, **ucfgp;
978 u_char *specific_buf;
979 int retcode = 0;
980 int column;
981 /* int raidid; */
982 struct rf_recon_req *rrcopy, *rr;
983 RF_ComponentLabel_t *clabel;
984 RF_ComponentLabel_t *ci_label;
985 RF_ComponentLabel_t **clabel_ptr;
986 RF_SingleComponent_t *sparePtr,*componentPtr;
987 RF_SingleComponent_t component;
988 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
989 int i, j, d;
990 #ifdef __HAVE_OLD_DISKLABEL
991 struct disklabel newlabel;
992 #endif
993 struct dkwedge_info *dkw;
994
995 if (unit >= numraid)
996 return (ENXIO);
997 rs = &raid_softc[unit];
998 raidPtr = raidPtrs[unit];
999
1000 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1001 (int) DISKPART(dev), (int) unit, cmd));
1002
1003 /* Must be open for writes for these commands... */
1004 switch (cmd) {
1005 #ifdef DIOCGSECTORSIZE
1006 case DIOCGSECTORSIZE:
1007 *(u_int *)data = raidPtr->bytesPerSector;
1008 return 0;
1009 case DIOCGMEDIASIZE:
1010 *(off_t *)data =
1011 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1012 return 0;
1013 #endif
1014 case DIOCSDINFO:
1015 case DIOCWDINFO:
1016 #ifdef __HAVE_OLD_DISKLABEL
1017 case ODIOCWDINFO:
1018 case ODIOCSDINFO:
1019 #endif
1020 case DIOCWLABEL:
1021 case DIOCAWEDGE:
1022 case DIOCDWEDGE:
1023 if ((flag & FWRITE) == 0)
1024 return (EBADF);
1025 }
1026
1027 /* Must be initialized for these... */
1028 switch (cmd) {
1029 case DIOCGDINFO:
1030 case DIOCSDINFO:
1031 case DIOCWDINFO:
1032 #ifdef __HAVE_OLD_DISKLABEL
1033 case ODIOCGDINFO:
1034 case ODIOCWDINFO:
1035 case ODIOCSDINFO:
1036 case ODIOCGDEFLABEL:
1037 #endif
1038 case DIOCGPART:
1039 case DIOCWLABEL:
1040 case DIOCGDEFLABEL:
1041 case DIOCAWEDGE:
1042 case DIOCDWEDGE:
1043 case DIOCLWEDGES:
1044 case DIOCCACHESYNC:
1045 case RAIDFRAME_SHUTDOWN:
1046 case RAIDFRAME_REWRITEPARITY:
1047 case RAIDFRAME_GET_INFO:
1048 case RAIDFRAME_RESET_ACCTOTALS:
1049 case RAIDFRAME_GET_ACCTOTALS:
1050 case RAIDFRAME_KEEP_ACCTOTALS:
1051 case RAIDFRAME_GET_SIZE:
1052 case RAIDFRAME_FAIL_DISK:
1053 case RAIDFRAME_COPYBACK:
1054 case RAIDFRAME_CHECK_RECON_STATUS:
1055 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1056 case RAIDFRAME_GET_COMPONENT_LABEL:
1057 case RAIDFRAME_SET_COMPONENT_LABEL:
1058 case RAIDFRAME_ADD_HOT_SPARE:
1059 case RAIDFRAME_REMOVE_HOT_SPARE:
1060 case RAIDFRAME_INIT_LABELS:
1061 case RAIDFRAME_REBUILD_IN_PLACE:
1062 case RAIDFRAME_CHECK_PARITY:
1063 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1064 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1065 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1066 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1067 case RAIDFRAME_SET_AUTOCONFIG:
1068 case RAIDFRAME_SET_ROOT:
1069 case RAIDFRAME_DELETE_COMPONENT:
1070 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1071 case RAIDFRAME_PARITYMAP_STATUS:
1072 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1073 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1074 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1075 if ((rs->sc_flags & RAIDF_INITED) == 0)
1076 return (ENXIO);
1077 }
1078
1079 switch (cmd) {
1080 #ifdef COMPAT_50
1081 case RAIDFRAME_GET_INFO50:
1082 return rf_get_info50(raidPtr, data);
1083
1084 case RAIDFRAME_CONFIGURE50:
1085 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1086 return retcode;
1087 goto config;
1088 #endif
1089 /* configure the system */
1090 case RAIDFRAME_CONFIGURE:
1091
1092 if (raidPtr->valid) {
1093 /* There is a valid RAID set running on this unit! */
1094 printf("raid%d: Device already configured!\n",unit);
1095 return(EINVAL);
1096 }
1097
1098 /* copy-in the configuration information */
1099 /* data points to a pointer to the configuration structure */
1100
1101 u_cfg = *((RF_Config_t **) data);
1102 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1103 if (k_cfg == NULL) {
1104 return (ENOMEM);
1105 }
1106 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1107 if (retcode) {
1108 RF_Free(k_cfg, sizeof(RF_Config_t));
1109 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1110 retcode));
1111 return (retcode);
1112 }
1113 goto config;
1114 config:
1115 /* allocate a buffer for the layout-specific data, and copy it
1116 * in */
1117 if (k_cfg->layoutSpecificSize) {
1118 if (k_cfg->layoutSpecificSize > 10000) {
1119 /* sanity check */
1120 RF_Free(k_cfg, sizeof(RF_Config_t));
1121 return (EINVAL);
1122 }
1123 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1124 (u_char *));
1125 if (specific_buf == NULL) {
1126 RF_Free(k_cfg, sizeof(RF_Config_t));
1127 return (ENOMEM);
1128 }
1129 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1130 k_cfg->layoutSpecificSize);
1131 if (retcode) {
1132 RF_Free(k_cfg, sizeof(RF_Config_t));
1133 RF_Free(specific_buf,
1134 k_cfg->layoutSpecificSize);
1135 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1136 retcode));
1137 return (retcode);
1138 }
1139 } else
1140 specific_buf = NULL;
1141 k_cfg->layoutSpecific = specific_buf;
1142
1143 /* should do some kind of sanity check on the configuration.
1144 * Store the sum of all the bytes in the last byte? */
1145
1146 /* configure the system */
1147
1148 /*
1149 * Clear the entire RAID descriptor, just to make sure
1150 * there is no stale data left in the case of a
1151 * reconfiguration
1152 */
1153 memset(raidPtr, 0, sizeof(*raidPtr));
1154 raidPtr->raidid = unit;
1155
1156 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1157
1158 if (retcode == 0) {
1159
1160 /* allow this many simultaneous IO's to
1161 this RAID device */
1162 raidPtr->openings = RAIDOUTSTANDING;
1163
1164 raidinit(raidPtr);
1165 rf_markalldirty(raidPtr);
1166 }
1167 /* free the buffers. No return code here. */
1168 if (k_cfg->layoutSpecificSize) {
1169 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1170 }
1171 RF_Free(k_cfg, sizeof(RF_Config_t));
1172
1173 return (retcode);
1174
1175 /* shutdown the system */
1176 case RAIDFRAME_SHUTDOWN:
1177
1178 part = DISKPART(dev);
1179 pmask = (1 << part);
1180
1181 if ((error = raidlock(rs)) != 0)
1182 return (error);
1183
1184 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1185 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1186 (rs->sc_dkdev.dk_copenmask & pmask)))
1187 retcode = EBUSY;
1188 else {
1189 rs->sc_flags |= RAIDF_SHUTDOWN;
1190 rs->sc_dkdev.dk_copenmask &= ~pmask;
1191 rs->sc_dkdev.dk_bopenmask &= ~pmask;
1192 rs->sc_dkdev.dk_openmask &= ~pmask;
1193 retcode = 0;
1194 }
1195
1196 raidunlock(rs);
1197
1198 if (retcode != 0)
1199 return retcode;
1200
1201 /* free the pseudo device attach bits */
1202
1203 cf = device_cfdata(rs->sc_dev);
1204 if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
1205 free(cf, M_RAIDFRAME);
1206
1207 return (retcode);
1208 case RAIDFRAME_GET_COMPONENT_LABEL:
1209 clabel_ptr = (RF_ComponentLabel_t **) data;
1210 /* need to read the component label for the disk indicated
1211 by row,column in clabel */
1212
1213 /*
1214 * Perhaps there should be an option to skip the in-core
1215 * copy and hit the disk, as with disklabel(8).
1216 */
1217 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1218
1219 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1220
1221 if (retcode) {
1222 RF_Free(clabel, sizeof(*clabel));
1223 return retcode;
1224 }
1225
1226 clabel->row = 0; /* Don't allow looking at anything else.*/
1227
1228 column = clabel->column;
1229
1230 if ((column < 0) || (column >= raidPtr->numCol +
1231 raidPtr->numSpare)) {
1232 RF_Free(clabel, sizeof(*clabel));
1233 return EINVAL;
1234 }
1235
1236 RF_Free(clabel, sizeof(*clabel));
1237
1238 clabel = raidget_component_label(raidPtr, column);
1239
1240 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1241
1242 #if 0
1243 case RAIDFRAME_SET_COMPONENT_LABEL:
1244 clabel = (RF_ComponentLabel_t *) data;
1245
1246 /* XXX check the label for valid stuff... */
1247 /* Note that some things *should not* get modified --
1248 the user should be re-initing the labels instead of
1249 trying to patch things.
1250 */
1251
1252 raidid = raidPtr->raidid;
1253 #ifdef DEBUG
1254 printf("raid%d: Got component label:\n", raidid);
1255 printf("raid%d: Version: %d\n", raidid, clabel->version);
1256 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1257 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1258 printf("raid%d: Column: %d\n", raidid, clabel->column);
1259 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1260 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1261 printf("raid%d: Status: %d\n", raidid, clabel->status);
1262 #endif
1263 clabel->row = 0;
1264 column = clabel->column;
1265
1266 if ((column < 0) || (column >= raidPtr->numCol)) {
1267 return(EINVAL);
1268 }
1269
1270 /* XXX this isn't allowed to do anything for now :-) */
1271
1272 /* XXX and before it is, we need to fill in the rest
1273 of the fields!?!?!?! */
1274 memcpy(raidget_component_label(raidPtr, column),
1275 clabel, sizeof(*clabel));
1276 raidflush_component_label(raidPtr, column);
1277 return (0);
1278 #endif
1279
1280 case RAIDFRAME_INIT_LABELS:
1281 clabel = (RF_ComponentLabel_t *) data;
1282 /*
1283 we only want the serial number from
1284 the above. We get all the rest of the information
1285 from the config that was used to create this RAID
1286 set.
1287 */
1288
1289 raidPtr->serial_number = clabel->serial_number;
1290
1291 for(column=0;column<raidPtr->numCol;column++) {
1292 diskPtr = &raidPtr->Disks[column];
1293 if (!RF_DEAD_DISK(diskPtr->status)) {
1294 ci_label = raidget_component_label(raidPtr,
1295 column);
1296 /* Zeroing this is important. */
1297 memset(ci_label, 0, sizeof(*ci_label));
1298 raid_init_component_label(raidPtr, ci_label);
1299 ci_label->serial_number =
1300 raidPtr->serial_number;
1301 ci_label->row = 0; /* we dont' pretend to support more */
1302 rf_component_label_set_partitionsize(ci_label,
1303 diskPtr->partitionSize);
1304 ci_label->column = column;
1305 raidflush_component_label(raidPtr, column);
1306 }
1307 /* XXXjld what about the spares? */
1308 }
1309
1310 return (retcode);
1311 case RAIDFRAME_SET_AUTOCONFIG:
1312 d = rf_set_autoconfig(raidPtr, *(int *) data);
1313 printf("raid%d: New autoconfig value is: %d\n",
1314 raidPtr->raidid, d);
1315 *(int *) data = d;
1316 return (retcode);
1317
1318 case RAIDFRAME_SET_ROOT:
1319 d = rf_set_rootpartition(raidPtr, *(int *) data);
1320 printf("raid%d: New rootpartition value is: %d\n",
1321 raidPtr->raidid, d);
1322 *(int *) data = d;
1323 return (retcode);
1324
1325 /* initialize all parity */
1326 case RAIDFRAME_REWRITEPARITY:
1327
1328 if (raidPtr->Layout.map->faultsTolerated == 0) {
1329 /* Parity for RAID 0 is trivially correct */
1330 raidPtr->parity_good = RF_RAID_CLEAN;
1331 return(0);
1332 }
1333
1334 if (raidPtr->parity_rewrite_in_progress == 1) {
1335 /* Re-write is already in progress! */
1336 return(EINVAL);
1337 }
1338
1339 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1340 rf_RewriteParityThread,
1341 raidPtr,"raid_parity");
1342 return (retcode);
1343
1344
1345 case RAIDFRAME_ADD_HOT_SPARE:
1346 sparePtr = (RF_SingleComponent_t *) data;
1347 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1348 retcode = rf_add_hot_spare(raidPtr, &component);
1349 return(retcode);
1350
1351 case RAIDFRAME_REMOVE_HOT_SPARE:
1352 return(retcode);
1353
1354 case RAIDFRAME_DELETE_COMPONENT:
1355 componentPtr = (RF_SingleComponent_t *)data;
1356 memcpy( &component, componentPtr,
1357 sizeof(RF_SingleComponent_t));
1358 retcode = rf_delete_component(raidPtr, &component);
1359 return(retcode);
1360
1361 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1362 componentPtr = (RF_SingleComponent_t *)data;
1363 memcpy( &component, componentPtr,
1364 sizeof(RF_SingleComponent_t));
1365 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1366 return(retcode);
1367
1368 case RAIDFRAME_REBUILD_IN_PLACE:
1369
1370 if (raidPtr->Layout.map->faultsTolerated == 0) {
1371 /* Can't do this on a RAID 0!! */
1372 return(EINVAL);
1373 }
1374
1375 if (raidPtr->recon_in_progress == 1) {
1376 /* a reconstruct is already in progress! */
1377 return(EINVAL);
1378 }
1379
1380 componentPtr = (RF_SingleComponent_t *) data;
1381 memcpy( &component, componentPtr,
1382 sizeof(RF_SingleComponent_t));
1383 component.row = 0; /* we don't support any more */
1384 column = component.column;
1385
1386 if ((column < 0) || (column >= raidPtr->numCol)) {
1387 return(EINVAL);
1388 }
1389
1390 RF_LOCK_MUTEX(raidPtr->mutex);
1391 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1392 (raidPtr->numFailures > 0)) {
1393 /* XXX 0 above shouldn't be constant!!! */
1394 /* some component other than this has failed.
1395 Let's not make things worse than they already
1396 are... */
1397 printf("raid%d: Unable to reconstruct to disk at:\n",
1398 raidPtr->raidid);
1399 printf("raid%d: Col: %d Too many failures.\n",
1400 raidPtr->raidid, column);
1401 RF_UNLOCK_MUTEX(raidPtr->mutex);
1402 return (EINVAL);
1403 }
1404 if (raidPtr->Disks[column].status ==
1405 rf_ds_reconstructing) {
1406 printf("raid%d: Unable to reconstruct to disk at:\n",
1407 raidPtr->raidid);
1408 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column);
1409
1410 RF_UNLOCK_MUTEX(raidPtr->mutex);
1411 return (EINVAL);
1412 }
1413 if (raidPtr->Disks[column].status == rf_ds_spared) {
1414 RF_UNLOCK_MUTEX(raidPtr->mutex);
1415 return (EINVAL);
1416 }
1417 RF_UNLOCK_MUTEX(raidPtr->mutex);
1418
1419 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1420 if (rrcopy == NULL)
1421 return(ENOMEM);
1422
1423 rrcopy->raidPtr = (void *) raidPtr;
1424 rrcopy->col = column;
1425
1426 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1427 rf_ReconstructInPlaceThread,
1428 rrcopy,"raid_reconip");
1429 return(retcode);
1430
1431 case RAIDFRAME_GET_INFO:
1432 if (!raidPtr->valid)
1433 return (ENODEV);
1434 ucfgp = (RF_DeviceConfig_t **) data;
1435 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1436 (RF_DeviceConfig_t *));
1437 if (d_cfg == NULL)
1438 return (ENOMEM);
1439 d_cfg->rows = 1; /* there is only 1 row now */
1440 d_cfg->cols = raidPtr->numCol;
1441 d_cfg->ndevs = raidPtr->numCol;
1442 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1443 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1444 return (ENOMEM);
1445 }
1446 d_cfg->nspares = raidPtr->numSpare;
1447 if (d_cfg->nspares >= RF_MAX_DISKS) {
1448 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1449 return (ENOMEM);
1450 }
1451 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1452 d = 0;
1453 for (j = 0; j < d_cfg->cols; j++) {
1454 d_cfg->devs[d] = raidPtr->Disks[j];
1455 d++;
1456 }
1457 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1458 d_cfg->spares[i] = raidPtr->Disks[j];
1459 }
1460 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1461 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1462
1463 return (retcode);
1464
1465 case RAIDFRAME_CHECK_PARITY:
1466 *(int *) data = raidPtr->parity_good;
1467 return (0);
1468
1469 case RAIDFRAME_PARITYMAP_STATUS:
1470 if (rf_paritymap_ineligible(raidPtr))
1471 return EINVAL;
1472 rf_paritymap_status(raidPtr->parity_map,
1473 (struct rf_pmstat *)data);
1474 return 0;
1475
1476 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1477 if (rf_paritymap_ineligible(raidPtr))
1478 return EINVAL;
1479 if (raidPtr->parity_map == NULL)
1480 return ENOENT; /* ??? */
1481 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1482 (struct rf_pmparams *)data, 1))
1483 return EINVAL;
1484 return 0;
1485
1486 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1487 if (rf_paritymap_ineligible(raidPtr))
1488 return EINVAL;
1489 *(int *) data = rf_paritymap_get_disable(raidPtr);
1490 return 0;
1491
1492 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1493 if (rf_paritymap_ineligible(raidPtr))
1494 return EINVAL;
1495 rf_paritymap_set_disable(raidPtr, *(int *)data);
1496 /* XXX should errors be passed up? */
1497 return 0;
1498
1499 case RAIDFRAME_RESET_ACCTOTALS:
1500 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1501 return (0);
1502
1503 case RAIDFRAME_GET_ACCTOTALS:
1504 totals = (RF_AccTotals_t *) data;
1505 *totals = raidPtr->acc_totals;
1506 return (0);
1507
1508 case RAIDFRAME_KEEP_ACCTOTALS:
1509 raidPtr->keep_acc_totals = *(int *)data;
1510 return (0);
1511
1512 case RAIDFRAME_GET_SIZE:
1513 *(int *) data = raidPtr->totalSectors;
1514 return (0);
1515
1516 /* fail a disk & optionally start reconstruction */
1517 case RAIDFRAME_FAIL_DISK:
1518
1519 if (raidPtr->Layout.map->faultsTolerated == 0) {
1520 /* Can't do this on a RAID 0!! */
1521 return(EINVAL);
1522 }
1523
1524 rr = (struct rf_recon_req *) data;
1525 rr->row = 0;
1526 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1527 return (EINVAL);
1528
1529
1530 RF_LOCK_MUTEX(raidPtr->mutex);
1531 if (raidPtr->status == rf_rs_reconstructing) {
1532 /* you can't fail a disk while we're reconstructing! */
1533 /* XXX wrong for RAID6 */
1534 RF_UNLOCK_MUTEX(raidPtr->mutex);
1535 return (EINVAL);
1536 }
1537 if ((raidPtr->Disks[rr->col].status ==
1538 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1539 /* some other component has failed. Let's not make
1540 things worse. XXX wrong for RAID6 */
1541 RF_UNLOCK_MUTEX(raidPtr->mutex);
1542 return (EINVAL);
1543 }
1544 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1545 /* Can't fail a spared disk! */
1546 RF_UNLOCK_MUTEX(raidPtr->mutex);
1547 return (EINVAL);
1548 }
1549 RF_UNLOCK_MUTEX(raidPtr->mutex);
1550
1551 /* make a copy of the recon request so that we don't rely on
1552 * the user's buffer */
1553 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1554 if (rrcopy == NULL)
1555 return(ENOMEM);
1556 memcpy(rrcopy, rr, sizeof(*rr));
1557 rrcopy->raidPtr = (void *) raidPtr;
1558
1559 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1560 rf_ReconThread,
1561 rrcopy,"raid_recon");
1562 return (0);
1563
1564 /* invoke a copyback operation after recon on whatever disk
1565 * needs it, if any */
1566 case RAIDFRAME_COPYBACK:
1567
1568 if (raidPtr->Layout.map->faultsTolerated == 0) {
1569 /* This makes no sense on a RAID 0!! */
1570 return(EINVAL);
1571 }
1572
1573 if (raidPtr->copyback_in_progress == 1) {
1574 /* Copyback is already in progress! */
1575 return(EINVAL);
1576 }
1577
1578 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1579 rf_CopybackThread,
1580 raidPtr,"raid_copyback");
1581 return (retcode);
1582
1583 /* return the percentage completion of reconstruction */
1584 case RAIDFRAME_CHECK_RECON_STATUS:
1585 if (raidPtr->Layout.map->faultsTolerated == 0) {
1586 /* This makes no sense on a RAID 0, so tell the
1587 user it's done. */
1588 *(int *) data = 100;
1589 return(0);
1590 }
1591 if (raidPtr->status != rf_rs_reconstructing)
1592 *(int *) data = 100;
1593 else {
1594 if (raidPtr->reconControl->numRUsTotal > 0) {
1595 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1596 } else {
1597 *(int *) data = 0;
1598 }
1599 }
1600 return (0);
1601 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1602 progressInfoPtr = (RF_ProgressInfo_t **) data;
1603 if (raidPtr->status != rf_rs_reconstructing) {
1604 progressInfo.remaining = 0;
1605 progressInfo.completed = 100;
1606 progressInfo.total = 100;
1607 } else {
1608 progressInfo.total =
1609 raidPtr->reconControl->numRUsTotal;
1610 progressInfo.completed =
1611 raidPtr->reconControl->numRUsComplete;
1612 progressInfo.remaining = progressInfo.total -
1613 progressInfo.completed;
1614 }
1615 retcode = copyout(&progressInfo, *progressInfoPtr,
1616 sizeof(RF_ProgressInfo_t));
1617 return (retcode);
1618
1619 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1620 if (raidPtr->Layout.map->faultsTolerated == 0) {
1621 /* This makes no sense on a RAID 0, so tell the
1622 user it's done. */
1623 *(int *) data = 100;
1624 return(0);
1625 }
1626 if (raidPtr->parity_rewrite_in_progress == 1) {
1627 *(int *) data = 100 *
1628 raidPtr->parity_rewrite_stripes_done /
1629 raidPtr->Layout.numStripe;
1630 } else {
1631 *(int *) data = 100;
1632 }
1633 return (0);
1634
1635 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1636 progressInfoPtr = (RF_ProgressInfo_t **) data;
1637 if (raidPtr->parity_rewrite_in_progress == 1) {
1638 progressInfo.total = raidPtr->Layout.numStripe;
1639 progressInfo.completed =
1640 raidPtr->parity_rewrite_stripes_done;
1641 progressInfo.remaining = progressInfo.total -
1642 progressInfo.completed;
1643 } else {
1644 progressInfo.remaining = 0;
1645 progressInfo.completed = 100;
1646 progressInfo.total = 100;
1647 }
1648 retcode = copyout(&progressInfo, *progressInfoPtr,
1649 sizeof(RF_ProgressInfo_t));
1650 return (retcode);
1651
1652 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1653 if (raidPtr->Layout.map->faultsTolerated == 0) {
1654 /* This makes no sense on a RAID 0 */
1655 *(int *) data = 100;
1656 return(0);
1657 }
1658 if (raidPtr->copyback_in_progress == 1) {
1659 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1660 raidPtr->Layout.numStripe;
1661 } else {
1662 *(int *) data = 100;
1663 }
1664 return (0);
1665
1666 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1667 progressInfoPtr = (RF_ProgressInfo_t **) data;
1668 if (raidPtr->copyback_in_progress == 1) {
1669 progressInfo.total = raidPtr->Layout.numStripe;
1670 progressInfo.completed =
1671 raidPtr->copyback_stripes_done;
1672 progressInfo.remaining = progressInfo.total -
1673 progressInfo.completed;
1674 } else {
1675 progressInfo.remaining = 0;
1676 progressInfo.completed = 100;
1677 progressInfo.total = 100;
1678 }
1679 retcode = copyout(&progressInfo, *progressInfoPtr,
1680 sizeof(RF_ProgressInfo_t));
1681 return (retcode);
1682
1683 /* the sparetable daemon calls this to wait for the kernel to
1684 * need a spare table. this ioctl does not return until a
1685 * spare table is needed. XXX -- calling mpsleep here in the
1686 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1687 * -- I should either compute the spare table in the kernel,
1688 * or have a different -- XXX XXX -- interface (a different
1689 * character device) for delivering the table -- XXX */
1690 #if 0
1691 case RAIDFRAME_SPARET_WAIT:
1692 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1693 while (!rf_sparet_wait_queue)
1694 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1695 waitreq = rf_sparet_wait_queue;
1696 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1697 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1698
1699 /* structure assignment */
1700 *((RF_SparetWait_t *) data) = *waitreq;
1701
1702 RF_Free(waitreq, sizeof(*waitreq));
1703 return (0);
1704
1705 /* wakes up a process waiting on SPARET_WAIT and puts an error
1706 * code in it that will cause the dameon to exit */
1707 case RAIDFRAME_ABORT_SPARET_WAIT:
1708 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1709 waitreq->fcol = -1;
1710 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1711 waitreq->next = rf_sparet_wait_queue;
1712 rf_sparet_wait_queue = waitreq;
1713 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1714 wakeup(&rf_sparet_wait_queue);
1715 return (0);
1716
1717 /* used by the spare table daemon to deliver a spare table
1718 * into the kernel */
1719 case RAIDFRAME_SEND_SPARET:
1720
1721 /* install the spare table */
1722 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1723
1724 /* respond to the requestor. the return status of the spare
1725 * table installation is passed in the "fcol" field */
1726 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1727 waitreq->fcol = retcode;
1728 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1729 waitreq->next = rf_sparet_resp_queue;
1730 rf_sparet_resp_queue = waitreq;
1731 wakeup(&rf_sparet_resp_queue);
1732 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1733
1734 return (retcode);
1735 #endif
1736
1737 default:
1738 break; /* fall through to the os-specific code below */
1739
1740 }
1741
1742 if (!raidPtr->valid)
1743 return (EINVAL);
1744
1745 /*
1746 * Add support for "regular" device ioctls here.
1747 */
1748
1749 error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
1750 if (error != EPASSTHROUGH)
1751 return (error);
1752
1753 switch (cmd) {
1754 case DIOCGDINFO:
1755 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1756 break;
1757 #ifdef __HAVE_OLD_DISKLABEL
1758 case ODIOCGDINFO:
1759 newlabel = *(rs->sc_dkdev.dk_label);
1760 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1761 return ENOTTY;
1762 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1763 break;
1764 #endif
1765
1766 case DIOCGPART:
1767 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1768 ((struct partinfo *) data)->part =
1769 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1770 break;
1771
1772 case DIOCWDINFO:
1773 case DIOCSDINFO:
1774 #ifdef __HAVE_OLD_DISKLABEL
1775 case ODIOCWDINFO:
1776 case ODIOCSDINFO:
1777 #endif
1778 {
1779 struct disklabel *lp;
1780 #ifdef __HAVE_OLD_DISKLABEL
1781 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1782 memset(&newlabel, 0, sizeof newlabel);
1783 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1784 lp = &newlabel;
1785 } else
1786 #endif
1787 lp = (struct disklabel *)data;
1788
1789 if ((error = raidlock(rs)) != 0)
1790 return (error);
1791
1792 rs->sc_flags |= RAIDF_LABELLING;
1793
1794 error = setdisklabel(rs->sc_dkdev.dk_label,
1795 lp, 0, rs->sc_dkdev.dk_cpulabel);
1796 if (error == 0) {
1797 if (cmd == DIOCWDINFO
1798 #ifdef __HAVE_OLD_DISKLABEL
1799 || cmd == ODIOCWDINFO
1800 #endif
1801 )
1802 error = writedisklabel(RAIDLABELDEV(dev),
1803 raidstrategy, rs->sc_dkdev.dk_label,
1804 rs->sc_dkdev.dk_cpulabel);
1805 }
1806 rs->sc_flags &= ~RAIDF_LABELLING;
1807
1808 raidunlock(rs);
1809
1810 if (error)
1811 return (error);
1812 break;
1813 }
1814
1815 case DIOCWLABEL:
1816 if (*(int *) data != 0)
1817 rs->sc_flags |= RAIDF_WLABEL;
1818 else
1819 rs->sc_flags &= ~RAIDF_WLABEL;
1820 break;
1821
1822 case DIOCGDEFLABEL:
1823 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1824 break;
1825
1826 #ifdef __HAVE_OLD_DISKLABEL
1827 case ODIOCGDEFLABEL:
1828 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1829 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1830 return ENOTTY;
1831 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1832 break;
1833 #endif
1834
1835 case DIOCAWEDGE:
1836 case DIOCDWEDGE:
1837 dkw = (void *)data;
1838
1839 /* If the ioctl happens here, the parent is us. */
1840 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1841 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1842
1843 case DIOCLWEDGES:
1844 return dkwedge_list(&rs->sc_dkdev,
1845 (struct dkwedge_list *)data, l);
1846 case DIOCCACHESYNC:
1847 return rf_sync_component_caches(raidPtr);
1848 default:
1849 retcode = ENOTTY;
1850 }
1851 return (retcode);
1852
1853 }
1854
1855
1856 /* raidinit -- complete the rest of the initialization for the
1857 RAIDframe device. */
1858
1859
1860 static void
1861 raidinit(RF_Raid_t *raidPtr)
1862 {
1863 cfdata_t cf;
1864 struct raid_softc *rs;
1865 int unit;
1866
1867 unit = raidPtr->raidid;
1868
1869 rs = &raid_softc[unit];
1870
1871 /* XXX should check return code first... */
1872 rs->sc_flags |= RAIDF_INITED;
1873
1874 /* XXX doesn't check bounds. */
1875 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1876
1877 /* attach the pseudo device */
1878 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1879 cf->cf_name = raid_cd.cd_name;
1880 cf->cf_atname = raid_cd.cd_name;
1881 cf->cf_unit = unit;
1882 cf->cf_fstate = FSTATE_STAR;
1883
1884 rs->sc_dev = config_attach_pseudo(cf);
1885
1886 if (rs->sc_dev == NULL) {
1887 printf("raid%d: config_attach_pseudo failed\n",
1888 raidPtr->raidid);
1889 rs->sc_flags &= ~RAIDF_INITED;
1890 free(cf, M_RAIDFRAME);
1891 return;
1892 }
1893
1894 /* disk_attach actually creates space for the CPU disklabel, among
1895 * other things, so it's critical to call this *BEFORE* we try putzing
1896 * with disklabels. */
1897
1898 disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1899 disk_attach(&rs->sc_dkdev);
1900 disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
1901
1902 /* XXX There may be a weird interaction here between this, and
1903 * protectedSectors, as used in RAIDframe. */
1904
1905 rs->sc_size = raidPtr->totalSectors;
1906
1907 dkwedge_discover(&rs->sc_dkdev);
1908
1909 rf_set_properties(rs, raidPtr);
1910
1911 }
1912 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1913 /* wake up the daemon & tell it to get us a spare table
1914 * XXX
1915 * the entries in the queues should be tagged with the raidPtr
1916 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
1918 * XXX
1919 *
1920 * XXX This code is not currently used. GO
1921 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	/* Post the request on the wait queue and poke the daemon,
	 * which is sleeping on rf_sparet_wait_queue. */
	RF_LOCK_MUTEX(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	wakeup(&rf_sparet_wait_queue);

	/*
	 * Block until a response shows up on the response queue.
	 * NOTE(review): the comment below says mpsleep drops the mutex,
	 * but tsleep() is used here and is not passed the mutex --
	 * confirm the locking protocol around rf_sparet_wait_mutex.
	 */
	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		tsleep(&rf_sparet_resp_queue, PRIBIO,
		       "raidframe getsparetable", 0);
	}
	/* Dequeue the response; this is a different object from the
	 * request we posted above. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);

	/* The daemon passes its installation status back in fcol. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1946 #endif
1947
1948 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1949 * bp & passes it down.
1950 * any calls originating in the kernel must use non-blocking I/O
1951 * do some extra sanity checking to return "appropriate" error values for
1952 * certain conditions (to make some standard utilities work)
1953 *
1954 * Formerly known as: rf_DoAccessKernel
1955 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	int unit;
	struct raid_softc *rs;
	int do_async;
	struct buf *bp;
	int rc;

	unit = raidPtr->raidid;
	rs = &raid_softc[unit];

	/* quick check to see if anything has died recently */
	RF_LOCK_MUTEX(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the mutex across the label update, then retake it
		 * before touching numNewFailures again. */
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Loop invariant: raidPtr->mutex is held at the top of every
	 * iteration and released while one buffer is processed. */
	while (raidPtr->openings > 0) {
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* Convert from DEV_BSIZE units to this set's sector size,
		 * then add the partition offset (unless this is the raw
		 * partition, which spans the whole device). */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		/* pb is 1 iff the byte count is not an exact multiple of
		 * the sector size. */
		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* The "1 ||" makes this debug block unconditional. */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject requests that run past the end of the set; the
		 * "sum < x" comparisons catch arithmetic wraparound. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Reject transfers that are not a whole number of sectors. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* Claim one of this unit's openings for the request. */
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->openings--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* NOTE(review): the opening consumed above is not
			 * returned here on failure -- presumably the
			 * completion path accounts for it; confirm. */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		RF_LOCK_MUTEX(raidPtr->mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->mutex);
}
2076
2077
2078
2079
2080 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2081
2082 int
2083 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2084 {
2085 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2086 struct buf *bp;
2087
2088 req->queue = queue;
2089 bp = req->bp;
2090
2091 switch (req->type) {
2092 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
2093 /* XXX need to do something extra here.. */
2094 /* I'm leaving this in, as I've never actually seen it used,
2095 * and I'd like folks to report it... GO */
2096 printf(("WAKEUP CALLED\n"));
2097 queue->numOutstanding++;
2098
2099 bp->b_flags = 0;
2100 bp->b_private = req;
2101
2102 KernelWakeupFunc(bp);
2103 break;
2104
2105 case RF_IO_TYPE_READ:
2106 case RF_IO_TYPE_WRITE:
2107 #if RF_ACC_TRACE > 0
2108 if (req->tracerec) {
2109 RF_ETIMER_START(req->tracerec->timer);
2110 }
2111 #endif
2112 InitBP(bp, queue->rf_cinfo->ci_vp,
2113 op, queue->rf_cinfo->ci_dev,
2114 req->sectorOffset, req->numSector,
2115 req->buf, KernelWakeupFunc, (void *) req,
2116 queue->raidPtr->logBytesPerSector, req->b_proc);
2117
2118 if (rf_debugKernelAccess) {
2119 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2120 (long) bp->b_blkno));
2121 }
2122 queue->numOutstanding++;
2123 queue->last_deq_sector = req->sectorOffset;
2124 /* acc wouldn't have been let in if there were any pending
2125 * reqs at any other priority */
2126 queue->curPriority = req->priority;
2127
2128 db1_printf(("Going for %c to unit %d col %d\n",
2129 req->type, queue->raidPtr->raidid,
2130 queue->col));
2131 db1_printf(("sector %d count %d (%d bytes) %d\n",
2132 (int) req->sectorOffset, (int) req->numSector,
2133 (int) (req->numSector <<
2134 queue->raidPtr->logBytesPerSector),
2135 (int) queue->raidPtr->logBytesPerSector));
2136
2137 /*
2138 * XXX: drop lock here since this can block at
2139 * least with backing SCSI devices. Retake it
2140 * to minimize fuss with calling interfaces.
2141 */
2142
2143 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2144 bdev_strategy(bp);
2145 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2146 break;
2147
2148 default:
2149 panic("bad req->type in rf_DispatchKernelIO");
2150 }
2151 db1_printf(("Exiting from DispatchKernelIO\n"));
2152
2153 return (0);
2154 }
/* this is the callback function associated with an I/O invoked from
2156 kernel code.
2157 */
2158 static void
2159 KernelWakeupFunc(struct buf *bp)
2160 {
2161 RF_DiskQueueData_t *req = NULL;
2162 RF_DiskQueue_t *queue;
2163 int s;
2164
2165 s = splbio();
2166 db1_printf(("recovering the request queue:\n"));
2167 req = bp->b_private;
2168
2169 queue = (RF_DiskQueue_t *) req->queue;
2170
2171 #if RF_ACC_TRACE > 0
2172 if (req->tracerec) {
2173 RF_ETIMER_STOP(req->tracerec->timer);
2174 RF_ETIMER_EVAL(req->tracerec->timer);
2175 RF_LOCK_MUTEX(rf_tracing_mutex);
2176 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2177 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2178 req->tracerec->num_phys_ios++;
2179 RF_UNLOCK_MUTEX(rf_tracing_mutex);
2180 }
2181 #endif
2182
2183 /* XXX Ok, let's get aggressive... If b_error is set, let's go
2184 * ballistic, and mark the component as hosed... */
2185
2186 if (bp->b_error != 0) {
2187 /* Mark the disk as dead */
2188 /* but only mark it once... */
2189 /* and only if it wouldn't leave this RAID set
2190 completely broken */
2191 if (((queue->raidPtr->Disks[queue->col].status ==
2192 rf_ds_optimal) ||
2193 (queue->raidPtr->Disks[queue->col].status ==
2194 rf_ds_used_spare)) &&
2195 (queue->raidPtr->numFailures <
2196 queue->raidPtr->Layout.map->faultsTolerated)) {
2197 printf("raid%d: IO Error. Marking %s as failed.\n",
2198 queue->raidPtr->raidid,
2199 queue->raidPtr->Disks[queue->col].devname);
2200 queue->raidPtr->Disks[queue->col].status =
2201 rf_ds_failed;
2202 queue->raidPtr->status = rf_rs_degraded;
2203 queue->raidPtr->numFailures++;
2204 queue->raidPtr->numNewFailures++;
2205 } else { /* Disk is already dead... */
2206 /* printf("Disk already marked as dead!\n"); */
2207 }
2208
2209 }
2210
2211 /* Fill in the error value */
2212
2213 req->error = bp->b_error;
2214
2215 simple_lock(&queue->raidPtr->iodone_lock);
2216
2217 /* Drop this one on the "finished" queue... */
2218 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2219
2220 /* Let the raidio thread know there is work to be done. */
2221 wakeup(&(queue->raidPtr->iodone));
2222
2223 simple_unlock(&queue->raidPtr->iodone_lock);
2224
2225 splx(s);
2226 }
2227
2228
2229
2230 /*
2231 * initialize a buf structure for doing an I/O in the kernel.
2232 */
2233 static void
2234 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2235 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2236 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2237 struct proc *b_proc)
2238 {
2239 /* bp->b_flags = B_PHYS | rw_flag; */
2240 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2241 bp->b_oflags = 0;
2242 bp->b_cflags = 0;
2243 bp->b_bcount = numSect << logBytesPerSector;
2244 bp->b_bufsize = bp->b_bcount;
2245 bp->b_error = 0;
2246 bp->b_dev = dev;
2247 bp->b_data = bf;
2248 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2249 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2250 if (bp->b_bcount == 0) {
2251 panic("bp->b_bcount is zero in InitBP!!");
2252 }
2253 bp->b_proc = b_proc;
2254 bp->b_iodone = cbFunc;
2255 bp->b_private = cbArg;
2256 }
2257
2258 static void
2259 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2260 struct disklabel *lp)
2261 {
2262 memset(lp, 0, sizeof(*lp));
2263
2264 /* fabricate a label... */
2265 lp->d_secperunit = raidPtr->totalSectors;
2266 lp->d_secsize = raidPtr->bytesPerSector;
2267 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2268 lp->d_ntracks = 4 * raidPtr->numCol;
2269 lp->d_ncylinders = raidPtr->totalSectors /
2270 (lp->d_nsectors * lp->d_ntracks);
2271 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2272
2273 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2274 lp->d_type = DTYPE_RAID;
2275 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2276 lp->d_rpm = 3600;
2277 lp->d_interleave = 1;
2278 lp->d_flags = 0;
2279
2280 lp->d_partitions[RAW_PART].p_offset = 0;
2281 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2282 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2283 lp->d_npartitions = RAW_PART + 1;
2284
2285 lp->d_magic = DISKMAGIC;
2286 lp->d_magic2 = DISKMAGIC;
2287 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2288
2289 }
2290 /*
2291 * Read the disklabel from the raid device. If one is not present, fake one
2292 * up.
2293 */
2294 static void
2295 raidgetdisklabel(dev_t dev)
2296 {
2297 int unit = raidunit(dev);
2298 struct raid_softc *rs = &raid_softc[unit];
2299 const char *errstring;
2300 struct disklabel *lp = rs->sc_dkdev.dk_label;
2301 struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
2302 RF_Raid_t *raidPtr;
2303
2304 db1_printf(("Getting the disklabel...\n"));
2305
2306 memset(clp, 0, sizeof(*clp));
2307
2308 raidPtr = raidPtrs[unit];
2309
2310 raidgetdefaultlabel(raidPtr, rs, lp);
2311
2312 /*
2313 * Call the generic disklabel extraction routine.
2314 */
2315 errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
2316 rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
2317 if (errstring)
2318 raidmakedisklabel(rs);
2319 else {
2320 int i;
2321 struct partition *pp;
2322
2323 /*
2324 * Sanity check whether the found disklabel is valid.
2325 *
2326 * This is necessary since total size of the raid device
2327 * may vary when an interleave is changed even though exactly
2328 * same components are used, and old disklabel may used
2329 * if that is found.
2330 */
2331 if (lp->d_secperunit != rs->sc_size)
2332 printf("raid%d: WARNING: %s: "
2333 "total sector size in disklabel (%" PRIu32 ") != "
2334 "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
2335 lp->d_secperunit, rs->sc_size);
2336 for (i = 0; i < lp->d_npartitions; i++) {
2337 pp = &lp->d_partitions[i];
2338 if (pp->p_offset + pp->p_size > rs->sc_size)
2339 printf("raid%d: WARNING: %s: end of partition `%c' "
2340 "exceeds the size of raid (%" PRIu64 ")\n",
2341 unit, rs->sc_xname, 'a' + i, rs->sc_size);
2342 }
2343 }
2344
2345 }
2346 /*
2347 * Take care of things one might want to take care of in the event
2348 * that a disklabel isn't present.
2349 */
2350 static void
2351 raidmakedisklabel(struct raid_softc *rs)
2352 {
2353 struct disklabel *lp = rs->sc_dkdev.dk_label;
2354 db1_printf(("Making a label..\n"));
2355
2356 /*
2357 * For historical reasons, if there's no disklabel present
2358 * the raw partition must be marked FS_BSDFFS.
2359 */
2360
2361 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2362
2363 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2364
2365 lp->d_checksum = dkcksum(lp);
2366 }
2367 /*
2368 * Wait interruptibly for an exclusive lock.
2369 *
2370 * XXX
2371 * Several drivers do this; it should be abstracted and made MP-safe.
2372 * (Hmm... where have we seen this warning before :-> GO )
2373 */
2374 static int
2375 raidlock(struct raid_softc *rs)
2376 {
2377 int error;
2378
2379 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2380 rs->sc_flags |= RAIDF_WANTED;
2381 if ((error =
2382 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2383 return (error);
2384 }
2385 rs->sc_flags |= RAIDF_LOCKED;
2386 return (0);
2387 }
2388 /*
2389 * Unlock and wake up any waiters.
2390 */
2391 static void
2392 raidunlock(struct raid_softc *rs)
2393 {
2394
2395 rs->sc_flags &= ~RAIDF_LOCKED;
2396 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2397 rs->sc_flags &= ~RAIDF_WANTED;
2398 wakeup(rs);
2399 }
2400 }
2401
2402
2403 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2404 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2405 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2406
2407 static daddr_t
2408 rf_component_info_offset(void)
2409 {
2410
2411 return RF_COMPONENT_INFO_OFFSET;
2412 }
2413
2414 static daddr_t
2415 rf_component_info_size(unsigned secsize)
2416 {
2417 daddr_t info_size;
2418
2419 KASSERT(secsize);
2420 if (secsize > RF_COMPONENT_INFO_SIZE)
2421 info_size = secsize;
2422 else
2423 info_size = RF_COMPONENT_INFO_SIZE;
2424
2425 return info_size;
2426 }
2427
2428 static daddr_t
2429 rf_parity_map_offset(RF_Raid_t *raidPtr)
2430 {
2431 daddr_t map_offset;
2432
2433 KASSERT(raidPtr->bytesPerSector);
2434 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2435 map_offset = raidPtr->bytesPerSector;
2436 else
2437 map_offset = RF_COMPONENT_INFO_SIZE;
2438 map_offset += rf_component_info_offset();
2439
2440 return map_offset;
2441 }
2442
2443 static daddr_t
2444 rf_parity_map_size(RF_Raid_t *raidPtr)
2445 {
2446 daddr_t map_size;
2447
2448 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2449 map_size = raidPtr->bytesPerSector;
2450 else
2451 map_size = RF_PARITY_MAP_SIZE;
2452
2453 return map_size;
2454 }
2455
2456 int
2457 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2458 {
2459 RF_ComponentLabel_t *clabel;
2460
2461 clabel = raidget_component_label(raidPtr, col);
2462 clabel->clean = RF_RAID_CLEAN;
2463 raidflush_component_label(raidPtr, col);
2464 return(0);
2465 }
2466
2467
2468 int
2469 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2470 {
2471 RF_ComponentLabel_t *clabel;
2472
2473 clabel = raidget_component_label(raidPtr, col);
2474 clabel->clean = RF_RAID_DIRTY;
2475 raidflush_component_label(raidPtr, col);
2476 return(0);
2477 }
2478
2479 int
2480 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2481 {
2482 KASSERT(raidPtr->bytesPerSector);
2483 return raidread_component_label(raidPtr->bytesPerSector,
2484 raidPtr->Disks[col].dev,
2485 raidPtr->raid_cinfo[col].ci_vp,
2486 &raidPtr->raid_cinfo[col].ci_label);
2487 }
2488
2489 RF_ComponentLabel_t *
2490 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2491 {
2492 return &raidPtr->raid_cinfo[col].ci_label;
2493 }
2494
2495 int
2496 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2497 {
2498 RF_ComponentLabel_t *label;
2499
2500 label = &raidPtr->raid_cinfo[col].ci_label;
2501 label->mod_counter = raidPtr->mod_counter;
2502 #ifndef RF_NO_PARITY_MAP
2503 label->parity_map_modcount = label->mod_counter;
2504 #endif
2505 return raidwrite_component_label(raidPtr->bytesPerSector,
2506 raidPtr->Disks[col].dev,
2507 raidPtr->raid_cinfo[col].ci_vp, label);
2508 }
2509
2510
2511 static int
2512 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2513 RF_ComponentLabel_t *clabel)
2514 {
2515 return raidread_component_area(dev, b_vp, clabel,
2516 sizeof(RF_ComponentLabel_t),
2517 rf_component_info_offset(),
2518 rf_component_info_size(secsize));
2519 }
2520
2521 /* ARGSUSED */
2522 static int
2523 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2524 size_t msize, daddr_t offset, daddr_t dsize)
2525 {
2526 struct buf *bp;
2527 const struct bdevsw *bdev;
2528 int error;
2529
2530 /* XXX should probably ensure that we don't try to do this if
2531 someone has changed rf_protected_sectors. */
2532
2533 if (b_vp == NULL) {
2534 /* For whatever reason, this component is not valid.
2535 Don't try to read a component label from it. */
2536 return(EINVAL);
2537 }
2538
2539 /* get a block of the appropriate size... */
2540 bp = geteblk((int)dsize);
2541 bp->b_dev = dev;
2542
2543 /* get our ducks in a row for the read */
2544 bp->b_blkno = offset / DEV_BSIZE;
2545 bp->b_bcount = dsize;
2546 bp->b_flags |= B_READ;
2547 bp->b_resid = dsize;
2548
2549 bdev = bdevsw_lookup(bp->b_dev);
2550 if (bdev == NULL)
2551 return (ENXIO);
2552 (*bdev->d_strategy)(bp);
2553
2554 error = biowait(bp);
2555
2556 if (!error) {
2557 memcpy(data, bp->b_data, msize);
2558 }
2559
2560 brelse(bp, 0);
2561 return(error);
2562 }
2563
2564
2565 static int
2566 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2567 RF_ComponentLabel_t *clabel)
2568 {
2569 return raidwrite_component_area(dev, b_vp, clabel,
2570 sizeof(RF_ComponentLabel_t),
2571 rf_component_info_offset(),
2572 rf_component_info_size(secsize), 0);
2573 }
2574
2575 /* ARGSUSED */
2576 static int
2577 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2578 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2579 {
2580 struct buf *bp;
2581 const struct bdevsw *bdev;
2582 int error;
2583
2584 /* get a block of the appropriate size... */
2585 bp = geteblk((int)dsize);
2586 bp->b_dev = dev;
2587
2588 /* get our ducks in a row for the write */
2589 bp->b_blkno = offset / DEV_BSIZE;
2590 bp->b_bcount = dsize;
2591 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2592 bp->b_resid = dsize;
2593
2594 memset(bp->b_data, 0, dsize);
2595 memcpy(bp->b_data, data, msize);
2596
2597 bdev = bdevsw_lookup(bp->b_dev);
2598 if (bdev == NULL)
2599 return (ENXIO);
2600 (*bdev->d_strategy)(bp);
2601 if (asyncp)
2602 return 0;
2603 error = biowait(bp);
2604 brelse(bp, 0);
2605 if (error) {
2606 #if 1
2607 printf("Failed to write RAID component info!\n");
2608 #endif
2609 }
2610
2611 return(error);
2612 }
2613
2614 void
2615 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2616 {
2617 int c;
2618
2619 for (c = 0; c < raidPtr->numCol; c++) {
2620 /* Skip dead disks. */
2621 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2622 continue;
2623 /* XXXjld: what if an error occurs here? */
2624 raidwrite_component_area(raidPtr->Disks[c].dev,
2625 raidPtr->raid_cinfo[c].ci_vp, map,
2626 RF_PARITYMAP_NBYTE,
2627 rf_parity_map_offset(raidPtr),
2628 rf_parity_map_size(raidPtr), 0);
2629 }
2630 }
2631
2632 void
2633 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2634 {
2635 struct rf_paritymap_ondisk tmp;
2636 int c,first;
2637
2638 first=1;
2639 for (c = 0; c < raidPtr->numCol; c++) {
2640 /* Skip dead disks. */
2641 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2642 continue;
2643 raidread_component_area(raidPtr->Disks[c].dev,
2644 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2645 RF_PARITYMAP_NBYTE,
2646 rf_parity_map_offset(raidPtr),
2647 rf_parity_map_size(raidPtr));
2648 if (first) {
2649 memcpy(map, &tmp, sizeof(*map));
2650 first = 0;
2651 } else {
2652 rf_paritymap_merge(map, &tmp);
2653 }
2654 }
2655 }
2656
/*
 * Bump the modification counter and mark every live component label
 * dirty, so that an unclean shutdown can be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	/* NOTE(review): scol is not reset per spare; if no matching
	   column were found it would reuse a stale value -- presumably
	   a used spare always has a matching spareCol.  Verify. */
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*
			 * We claim this disk is "optimal" if it's
			 * rf_ds_used_spare, as that means it should be
			 * directly substitutable for the disk it
			 * replaced.  Find which column it stands in for,
			 * and note that in its label too.
			 */

			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2716
2717
/*
 * Refresh the component labels of all optimal components and in-use
 * spares: bump the modification counter, record status and unit, and
 * (on a final update with clean parity) set the clean bit as well.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*
			 * We claim this disk is "optimal" if it's
			 * rf_ds_used_spare, as that means it should be
			 * directly substitutable for the disk it
			 * replaced.  Find which column it stands in for,
			 * and note that in its label too.
			 */

			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2792
2793 void
2794 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2795 {
2796
2797 if (vp != NULL) {
2798 if (auto_configured == 1) {
2799 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2800 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2801 vput(vp);
2802
2803 } else {
2804 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2805 }
2806 }
2807 }
2808
2809
2810 void
2811 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2812 {
2813 int r,c;
2814 struct vnode *vp;
2815 int acd;
2816
2817
2818 /* We take this opportunity to close the vnodes like we should.. */
2819
2820 for (c = 0; c < raidPtr->numCol; c++) {
2821 vp = raidPtr->raid_cinfo[c].ci_vp;
2822 acd = raidPtr->Disks[c].auto_configured;
2823 rf_close_component(raidPtr, vp, acd);
2824 raidPtr->raid_cinfo[c].ci_vp = NULL;
2825 raidPtr->Disks[c].auto_configured = 0;
2826 }
2827
2828 for (r = 0; r < raidPtr->numSpare; r++) {
2829 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2830 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2831 rf_close_component(raidPtr, vp, acd);
2832 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2833 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2834 }
2835 }
2836
2837
2838 void
2839 rf_ReconThread(struct rf_recon_req *req)
2840 {
2841 int s;
2842 RF_Raid_t *raidPtr;
2843
2844 s = splbio();
2845 raidPtr = (RF_Raid_t *) req->raidPtr;
2846 raidPtr->recon_in_progress = 1;
2847
2848 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2849 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2850
2851 RF_Free(req, sizeof(*req));
2852
2853 raidPtr->recon_in_progress = 0;
2854 splx(s);
2855
2856 /* That's all... */
2857 kthread_exit(0); /* does not return */
2858 }
2859
2860 void
2861 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2862 {
2863 int retcode;
2864 int s;
2865
2866 raidPtr->parity_rewrite_stripes_done = 0;
2867 raidPtr->parity_rewrite_in_progress = 1;
2868 s = splbio();
2869 retcode = rf_RewriteParity(raidPtr);
2870 splx(s);
2871 if (retcode) {
2872 printf("raid%d: Error re-writing parity (%d)!\n",
2873 raidPtr->raidid, retcode);
2874 } else {
2875 /* set the clean bit! If we shutdown correctly,
2876 the clean bit on each component label will get
2877 set */
2878 raidPtr->parity_good = RF_RAID_CLEAN;
2879 }
2880 raidPtr->parity_rewrite_in_progress = 0;
2881
2882 /* Anyone waiting for us to stop? If so, inform them... */
2883 if (raidPtr->waitShutdown) {
2884 wakeup(&raidPtr->parity_rewrite_in_progress);
2885 }
2886
2887 /* That's all... */
2888 kthread_exit(0); /* does not return */
2889 }
2890
2891
2892 void
2893 rf_CopybackThread(RF_Raid_t *raidPtr)
2894 {
2895 int s;
2896
2897 raidPtr->copyback_in_progress = 1;
2898 s = splbio();
2899 rf_CopybackReconstructedData(raidPtr);
2900 splx(s);
2901 raidPtr->copyback_in_progress = 0;
2902
2903 /* That's all... */
2904 kthread_exit(0); /* does not return */
2905 }
2906
2907
2908 void
2909 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2910 {
2911 int s;
2912 RF_Raid_t *raidPtr;
2913
2914 s = splbio();
2915 raidPtr = req->raidPtr;
2916 raidPtr->recon_in_progress = 1;
2917 rf_ReconstructInPlace(raidPtr, req->col);
2918 RF_Free(req, sizeof(*req));
2919 raidPtr->recon_in_progress = 0;
2920 splx(s);
2921
2922 /* That's all... */
2923 kthread_exit(0); /* does not return */
2924 }
2925
/*
 * Try to read and validate a component label from (dev, vp).  If the
 * label is plausible, prepend a new RF_AutoConfig_t for the component
 * to ac_list and return the new list head (the vnode is kept open and
 * owned by the list entry).  If not, the vnode is closed and released.
 * On memory exhaustion the entire ac_list is torn down and NULL is
 * returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* out of memory: free everything collected so far */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				/* the label won't be reachable; free it too */
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			/* prepend the new entry to the list */
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: drop the label and close/release the vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2983
/*
 * Scan every disk device in the system for RAIDframe component labels
 * and return a list of the plausible components found.  Wedges (dk)
 * are checked via their wedge info; other disks via their disklabel's
 * FS_RAID partitions.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists. */
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			/* device exists but we can't size it; skip it */
			vput(vp);
			continue;
		}
		if (wedge) {
			/* wedges carry their type in the wedge info
			   rather than in a disklabel partition */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				/* not a RAIDframe wedge; close and move on */
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component takes ownership of vp */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		/* check each partition of this disk for a RAID component */
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			/* rf_get_component takes ownership of vp */
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
3134
3135
3136 static int
3137 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3138 {
3139
3140 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3141 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3142 ((clabel->clean == RF_RAID_CLEAN) ||
3143 (clabel->clean == RF_RAID_DIRTY)) &&
3144 clabel->row >=0 &&
3145 clabel->column >= 0 &&
3146 clabel->num_rows > 0 &&
3147 clabel->num_columns > 0 &&
3148 clabel->row < clabel->num_rows &&
3149 clabel->column < clabel->num_columns &&
3150 clabel->blockSize > 0 &&
3151 /*
3152 * numBlocksHi may contain garbage, but it is ok since
3153 * the type is unsigned. If it is really garbage,
3154 * rf_fix_old_label_size() will fix it.
3155 */
3156 rf_component_label_numblocks(clabel) > 0) {
3157 /*
3158 * label looks reasonable enough...
3159 * let's make sure it has no old garbage.
3160 */
3161 rf_fix_old_label_size(clabel, numsecs);
3162 return(1);
3163 }
3164 return(0);
3165 }
3166
3167
3168 /*
3169 * For reasons yet unknown, some old component labels have garbage in
3170 * the newer numBlocksHi region, and this causes lossage. Since those
3171 * disks will also have numsecs set to less than 32 bits of sectors,
 * we can determine when this corruption has occurred, and fix it.
3173 *
3174 * The exact same problem, with the same unknown reason, happens to
3175 * the partitionSizeHi member as well.
3176 */
3177 static void
3178 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3179 {
3180
3181 if (numsecs < ((uint64_t)1 << 32)) {
3182 if (clabel->numBlocksHi) {
3183 printf("WARNING: total sectors < 32 bits, yet "
3184 "numBlocksHi set\n"
3185 "WARNING: resetting numBlocksHi to zero.\n");
3186 clabel->numBlocksHi = 0;
3187 }
3188
3189 if (clabel->partitionSizeHi) {
3190 printf("WARNING: total sectors < 32 bits, yet "
3191 "partitionSizeHi set\n"
3192 "WARNING: resetting partitionSizeHi to zero.\n");
3193 clabel->partitionSizeHi = 0;
3194 }
3195 }
3196 }
3197
3198
3199 #ifdef DEBUG
/* Debug helper: dump the interesting fields of a component label. */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;

	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
3228 #endif
3229
3230 RF_ConfigSet_t *
3231 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3232 {
3233 RF_AutoConfig_t *ac;
3234 RF_ConfigSet_t *config_sets;
3235 RF_ConfigSet_t *cset;
3236 RF_AutoConfig_t *ac_next;
3237
3238
3239 config_sets = NULL;
3240
3241 /* Go through the AutoConfig list, and figure out which components
3242 belong to what sets. */
3243 ac = ac_list;
3244 while(ac!=NULL) {
3245 /* we're going to putz with ac->next, so save it here
3246 for use at the end of the loop */
3247 ac_next = ac->next;
3248
3249 if (config_sets == NULL) {
3250 /* will need at least this one... */
3251 config_sets = (RF_ConfigSet_t *)
3252 malloc(sizeof(RF_ConfigSet_t),
3253 M_RAIDFRAME, M_NOWAIT);
3254 if (config_sets == NULL) {
3255 panic("rf_create_auto_sets: No memory!");
3256 }
3257 /* this one is easy :) */
3258 config_sets->ac = ac;
3259 config_sets->next = NULL;
3260 config_sets->rootable = 0;
3261 ac->next = NULL;
3262 } else {
3263 /* which set does this component fit into? */
3264 cset = config_sets;
3265 while(cset!=NULL) {
3266 if (rf_does_it_fit(cset, ac)) {
3267 /* looks like it matches... */
3268 ac->next = cset->ac;
3269 cset->ac = ac;
3270 break;
3271 }
3272 cset = cset->next;
3273 }
3274 if (cset==NULL) {
3275 /* didn't find a match above... new set..*/
3276 cset = (RF_ConfigSet_t *)
3277 malloc(sizeof(RF_ConfigSet_t),
3278 M_RAIDFRAME, M_NOWAIT);
3279 if (cset == NULL) {
3280 panic("rf_create_auto_sets: No memory!");
3281 }
3282 cset->ac = ac;
3283 ac->next = NULL;
3284 cset->next = config_sets;
3285 cset->rootable = 0;
3286 config_sets = cset;
3287 }
3288 }
3289 ac = ac_next;
3290 }
3291
3292
3293 return(config_sets);
3294 }
3295
3296 static int
3297 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3298 {
3299 RF_ComponentLabel_t *clabel1, *clabel2;
3300
3301 /* If this one matches the *first* one in the set, that's good
3302 enough, since the other members of the set would have been
3303 through here too... */
3304 /* note that we are not checking partitionSize here..
3305
3306 Note that we are also not checking the mod_counters here.
3307 If everything else matches execpt the mod_counter, that's
3308 good enough for this test. We will deal with the mod_counters
3309 a little later in the autoconfiguration process.
3310
3311 (clabel1->mod_counter == clabel2->mod_counter) &&
3312
3313 The reason we don't check for this is that failed disks
3314 will have lower modification counts. If those disks are
3315 not added to the set they used to belong to, then they will
3316 form their own set, which may result in 2 different sets,
3317 for example, competing to be configured at raid0, and
3318 perhaps competing to be the root filesystem set. If the
3319 wrong ones get configured, or both attempt to become /,
3320 weird behaviour and or serious lossage will occur. Thus we
3321 need to bring them into the fold here, and kick them out at
3322 a later point.
3323
3324 */
3325
3326 clabel1 = cset->ac->clabel;
3327 clabel2 = ac->clabel;
3328 if ((clabel1->version == clabel2->version) &&
3329 (clabel1->serial_number == clabel2->serial_number) &&
3330 (clabel1->num_rows == clabel2->num_rows) &&
3331 (clabel1->num_columns == clabel2->num_columns) &&
3332 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3333 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3334 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3335 (clabel1->parityConfig == clabel2->parityConfig) &&
3336 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3337 (clabel1->blockSize == clabel2->blockSize) &&
3338 rf_component_label_numblocks(clabel1) ==
3339 rf_component_label_numblocks(clabel2) &&
3340 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3341 (clabel1->root_partition == clabel2->root_partition) &&
3342 (clabel1->last_unit == clabel2->last_unit) &&
3343 (clabel1->config_order == clabel2->config_order)) {
3344 /* if it get's here, it almost *has* to be a match */
3345 } else {
3346 /* it's not consistent with somebody in the set..
3347 punt */
3348 return(0);
3349 }
3350 /* all was fine.. it must fit... */
3351 return(1);
3352 }
3353
/*
 * Check that config set 'cset' has enough live components to be
 * configured.  Returns 1 if the set is viable, 0 if too many
 * components are missing or stale.
 *
 * A component only counts as present for a given column if its
 * mod_counter equals the highest mod_counter seen in the set --
 * components with lower counters are stale (e.g. previously failed
 * disks) and are treated as missing here.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;		/* columns this set is supposed to have */
	int num_missing;	/* columns with no current component */
	int mod_counter;	/* highest mod_counter in the set */
	int mod_counter_found;
	int even_pair_failed;	/* RAID 1: even half of current pair missing */
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set. If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the maximum over all members. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a component with a current mod_counter. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
			/* Didn't find one here! */
			/* special case for RAID 1, especially
			   where there are more than 2
			   components (where RAIDframe treats
			   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component. If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						/* both halves of this mirror
						   pair are gone -- the set
						   cannot be configured. */
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just finished the odd (second) component of a
			   mirror pair without bailing out above, so the
			   pair survives.  Reset the even_pair_failed flag
			   before moving on to the next pair. */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3456
3457 void
3458 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3459 RF_Raid_t *raidPtr)
3460 {
3461 RF_ComponentLabel_t *clabel;
3462 int i;
3463
3464 clabel = ac->clabel;
3465
3466 /* 1. Fill in the common stuff */
3467 config->numRow = clabel->num_rows = 1;
3468 config->numCol = clabel->num_columns;
3469 config->numSpare = 0; /* XXX should this be set here? */
3470 config->sectPerSU = clabel->sectPerSU;
3471 config->SUsPerPU = clabel->SUsPerPU;
3472 config->SUsPerRU = clabel->SUsPerRU;
3473 config->parityConfig = clabel->parityConfig;
3474 /* XXX... */
3475 strcpy(config->diskQueueType,"fifo");
3476 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3477 config->layoutSpecificSize = 0; /* XXX ?? */
3478
3479 while(ac!=NULL) {
3480 /* row/col values will be in range due to the checks
3481 in reasonable_label() */
3482 strcpy(config->devnames[0][ac->clabel->column],
3483 ac->devname);
3484 ac = ac->next;
3485 }
3486
3487 for(i=0;i<RF_MAXDBGV;i++) {
3488 config->debugVars[i][0] = 0;
3489 }
3490 }
3491
3492 int
3493 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3494 {
3495 RF_ComponentLabel_t *clabel;
3496 int column;
3497 int sparecol;
3498
3499 raidPtr->autoconfigure = new_value;
3500
3501 for(column=0; column<raidPtr->numCol; column++) {
3502 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3503 clabel = raidget_component_label(raidPtr, column);
3504 clabel->autoconfigure = new_value;
3505 raidflush_component_label(raidPtr, column);
3506 }
3507 }
3508 for(column = 0; column < raidPtr->numSpare ; column++) {
3509 sparecol = raidPtr->numCol + column;
3510 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3511 clabel = raidget_component_label(raidPtr, sparecol);
3512 clabel->autoconfigure = new_value;
3513 raidflush_component_label(raidPtr, sparecol);
3514 }
3515 }
3516 return(new_value);
3517 }
3518
3519 int
3520 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3521 {
3522 RF_ComponentLabel_t *clabel;
3523 int column;
3524 int sparecol;
3525
3526 raidPtr->root_partition = new_value;
3527 for(column=0; column<raidPtr->numCol; column++) {
3528 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3529 clabel = raidget_component_label(raidPtr, column);
3530 clabel->root_partition = new_value;
3531 raidflush_component_label(raidPtr, column);
3532 }
3533 }
3534 for(column = 0; column < raidPtr->numSpare ; column++) {
3535 sparecol = raidPtr->numCol + column;
3536 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3537 clabel = raidget_component_label(raidPtr, sparecol);
3538 clabel->root_partition = new_value;
3539 raidflush_component_label(raidPtr, sparecol);
3540 }
3541 }
3542 return(new_value);
3543 }
3544
3545 void
3546 rf_release_all_vps(RF_ConfigSet_t *cset)
3547 {
3548 RF_AutoConfig_t *ac;
3549
3550 ac = cset->ac;
3551 while(ac!=NULL) {
3552 /* Close the vp, and give it back */
3553 if (ac->vp) {
3554 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3555 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3556 vput(ac->vp);
3557 ac->vp = NULL;
3558 }
3559 ac = ac->next;
3560 }
3561 }
3562
3563
3564 void
3565 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3566 {
3567 RF_AutoConfig_t *ac;
3568 RF_AutoConfig_t *next_ac;
3569
3570 ac = cset->ac;
3571 while(ac!=NULL) {
3572 next_ac = ac->next;
3573 /* nuke the label */
3574 free(ac->clabel, M_RAIDFRAME);
3575 /* cleanup the config structure */
3576 free(ac, M_RAIDFRAME);
3577 /* "next.." */
3578 ac = next_ac;
3579 }
3580 /* and, finally, nuke the config set */
3581 free(cset, M_RAIDFRAME);
3582 }
3583
3584
/*
 * Initialize a component label from the current state of the RAID
 * set: version, serial/mod counters, geometry, layout parameters and
 * configuration flags.  Per-component fields (column, partitionSize,
 * etc.) are not touched here -- presumably the caller fills those in.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* stripe layout geometry */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3617
/*
 * Autoconfigure the RAID set described by 'cset'.  Picks a raid unit
 * (preferring the unit recorded in the component labels), builds an
 * RF_Config_t from the labels, and runs the normal configuration
 * path.  On success *unit is set to the configured raid unit and 0
 * is returned; on failure a non-zero value is returned and *unit is
 * left at -1 (or the chosen unit if rf_Configure itself failed).
 */
int
rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	int retcode;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	retcode = 0;
	*unit = -1;

	/* 1. Create a config structure */

	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
				       M_RAIDFRAME,
				       M_NOWAIT);
	if (config==NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return(1);
	}

	memset(config, 0, sizeof(RF_Config_t));

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	if ((raidID < 0) || (raidID >= numraid)) {
		/* let's not wander off into lala land. */
		raidID = numraid - 1;
	}
	if (raidPtrs[raidID]->valid != 0) {

		/*
		   Nope... Go looking for an alternative...
		   Start high so we don't immediately use raid0 if that's
		   not taken.
		*/

		/* if no unit is free, the loop ends with raidID == -1,
		   which the check below catches. */
		for(raidID = numraid - 1; raidID >= 0; raidID--) {
			if (raidPtrs[raidID]->valid == 0) {
				/* can use this one! */
				break;
			}
		}
	}

	if (raidID < 0) {
		/* punt... */
		printf("Unable to auto configure this set!\n");
		printf("(Out of RAID devs!)\n");
		free(config, M_RAIDFRAME);
		return(1);
	}

#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = raidPtrs[raidID];

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	retcode = rf_Configure(raidPtr, config, cset->ac);

	if (retcode == 0) {

		raidinit(raidPtrs[raidID]);

		rf_markalldirty(raidPtrs[raidID]);
		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
		if (cset->ac->clabel->root_partition==1) {
			/* everything configured just fine.  Make a note
			   that this set is eligible to be root. */
			cset->rootable = 1;
			/* XXX do this here? */
			raidPtrs[raidID]->root_partition = 1;
		}
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);

	*unit = raidID;
	return(retcode);
}
3718
3719 void
3720 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3721 {
3722 struct buf *bp;
3723
3724 bp = (struct buf *)desc->bp;
3725 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3726 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3727 }
3728
/*
 * Initialize pool 'p' for objects of 'size' bytes with wait channel
 * name 'w_chan', pre-allocate 'xmin' items, and set the low/high
 * watermarks to xmin/xmax.  IPL_BIO since the pools are used from
 * the I/O path.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3738
3739 /*
3740 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3741 * if there is IO pending and if that IO could possibly be done for a
3742 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3743 * otherwise.
3744 *
3745 */
3746
3747 int
3748 rf_buf_queue_check(int raidid)
3749 {
3750 if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) &&
3751 raidPtrs[raidid]->openings > 0) {
3752 /* there is work to do */
3753 return 0;
3754 }
3755 /* default is nothing to do */
3756 return 1;
3757 }
3758
3759 int
3760 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
3761 {
3762 uint64_t numsecs;
3763 unsigned secsize;
3764 int error;
3765
3766 error = getdisksize(vp, &numsecs, &secsize);
3767 if (error == 0) {
3768 diskPtr->blockSize = secsize;
3769 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3770 diskPtr->partitionSize = numsecs;
3771 return 0;
3772 }
3773 return error;
3774 }
3775
3776 static int
3777 raid_match(device_t self, cfdata_t cfdata, void *aux)
3778 {
3779 return 1;
3780 }
3781
/*
 * Autoconf attach function.  Intentionally empty: all real setup for
 * a raid unit happens in raidinit()/rf_Configure(), not at attach
 * time.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3787
3788
3789 static int
3790 raid_detach(device_t self, int flags)
3791 {
3792 int error;
3793 struct raid_softc *rs = &raid_softc[device_unit(self)];
3794
3795 if ((error = raidlock(rs)) != 0)
3796 return (error);
3797
3798 error = raid_detach_unlocked(rs);
3799
3800 raidunlock(rs);
3801
3802 return error;
3803 }
3804
/*
 * Publish a synthetic disk geometry for the raid device via proplib:
 * build a "geometry" dictionary inside a "disk-info" dictionary, set
 * it as a device property, and install it as the disk's dk_info
 * (releasing any previous one).  The cylinder/track numbers are
 * fabricated (4 tracks per component column) -- only the sector size
 * and total sector count are real.
 */
static void
rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	prop_dictionary_t disk_info, odisk_info, geom;
	disk_info = prop_dictionary_create();
	geom = prop_dictionary_create();
	prop_dictionary_set_uint64(geom, "sectors-per-unit",
				   raidPtr->totalSectors);
	prop_dictionary_set_uint32(geom, "sector-size",
				   raidPtr->bytesPerSector);

	/* one "track" per stripe of data sectors */
	prop_dictionary_set_uint16(geom, "sectors-per-track",
				   raidPtr->Layout.dataSectorsPerStripe);
	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
				   4 * raidPtr->numCol);

	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
	    raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
	    (4 * raidPtr->numCol)));

	/* disk_info takes its own reference on geom; drop ours */
	prop_dictionary_set(disk_info, "geometry", geom);
	prop_object_release(geom);
	prop_dictionary_set(device_properties(rs->sc_dev),
			    "disk-info", disk_info);
	/* swap in the new dk_info and release the old one, if any */
	odisk_info = rs->sc_dkdev.dk_info;
	rs->sc_dkdev.dk_info = disk_info;
	if (odisk_info)
		prop_object_release(odisk_info);
}
3834
3835 /*
3836 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3837 * We end up returning whatever error was returned by the first cache flush
3838 * that fails.
3839 */
3840
3841 int
3842 rf_sync_component_caches(RF_Raid_t *raidPtr)
3843 {
3844 int c, sparecol;
3845 int e,error;
3846 int force = 1;
3847
3848 error = 0;
3849 for (c = 0; c < raidPtr->numCol; c++) {
3850 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3851 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3852 &force, FWRITE, NOCRED);
3853 if (e) {
3854 if (e != ENODEV)
3855 printf("raid%d: cache flush to component %s failed.\n",
3856 raidPtr->raidid, raidPtr->Disks[c].devname);
3857 if (error == 0) {
3858 error = e;
3859 }
3860 }
3861 }
3862 }
3863
3864 for( c = 0; c < raidPtr->numSpare ; c++) {
3865 sparecol = raidPtr->numCol + c;
3866 /* Need to ensure that the reconstruct actually completed! */
3867 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3868 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3869 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3870 if (e) {
3871 if (e != ENODEV)
3872 printf("raid%d: cache flush to component %s failed.\n",
3873 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3874 if (error == 0) {
3875 error = e;
3876 }
3877 }
3878 }
3879 }
3880 return error;
3881 }
3882