rf_netbsdkintf.c revision 1.267 1 /* $NetBSD: rf_netbsdkintf.c,v 1.267 2009/10/13 22:46:28 pooka Exp $ */
2 /*-
3 * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Greg Oster; Jason R. Thorpe.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * Copyright (c) 1990, 1993
33 * The Regents of the University of California. All rights reserved.
34 *
35 * This code is derived from software contributed to Berkeley by
36 * the Systems Programming Group of the University of Utah Computer
37 * Science Department.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 * may be used to endorse or promote products derived from this software
49 * without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 *
63 * from: Utah $Hdr: cd.c 1.6 90/11/28$
64 *
65 * @(#)cd.c 8.2 (Berkeley) 11/16/93
66 */
67
68 /*
69 * Copyright (c) 1988 University of Utah.
70 *
71 * This code is derived from software contributed to Berkeley by
72 * the Systems Programming Group of the University of Utah Computer
73 * Science Department.
74 *
75 * Redistribution and use in source and binary forms, with or without
76 * modification, are permitted provided that the following conditions
77 * are met:
78 * 1. Redistributions of source code must retain the above copyright
79 * notice, this list of conditions and the following disclaimer.
80 * 2. Redistributions in binary form must reproduce the above copyright
81 * notice, this list of conditions and the following disclaimer in the
82 * documentation and/or other materials provided with the distribution.
83 * 3. All advertising materials mentioning features or use of this software
84 * must display the following acknowledgement:
85 * This product includes software developed by the University of
86 * California, Berkeley and its contributors.
87 * 4. Neither the name of the University nor the names of its contributors
88 * may be used to endorse or promote products derived from this software
89 * without specific prior written permission.
90 *
91 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
92 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
93 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
94 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
95 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
96 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
97 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
98 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
99 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
100 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
101 * SUCH DAMAGE.
102 *
103 * from: Utah $Hdr: cd.c 1.6 90/11/28$
104 *
105 * @(#)cd.c 8.2 (Berkeley) 11/16/93
106 */
107
108 /*
109 * Copyright (c) 1995 Carnegie-Mellon University.
110 * All rights reserved.
111 *
112 * Authors: Mark Holland, Jim Zelenka
113 *
114 * Permission to use, copy, modify and distribute this software and
115 * its documentation is hereby granted, provided that both the copyright
116 * notice and this permission notice appear in all copies of the
117 * software, derivative works or modified versions, and any portions
118 * thereof, and that both notices appear in supporting documentation.
119 *
120 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
121 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
122 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
123 *
124 * Carnegie Mellon requests users of this software to return to
125 *
126 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
127 * School of Computer Science
128 * Carnegie Mellon University
129 * Pittsburgh PA 15213-3890
130 *
131 * any improvements or extensions that they make and grant Carnegie the
132 * rights to redistribute these changes.
133 */
134
135 /***********************************************************
136 *
137 * rf_kintf.c -- the kernel interface routines for RAIDframe
138 *
139 ***********************************************************/
140
141 #include <sys/cdefs.h>
142 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.267 2009/10/13 22:46:28 pooka Exp $");
143
144 #ifdef _KERNEL_OPT
145 #include "opt_compat_netbsd.h"
146 #include "opt_raid_autoconfig.h"
147 #include "raid.h"
148 #endif
149
150 #include <sys/param.h>
151 #include <sys/errno.h>
152 #include <sys/pool.h>
153 #include <sys/proc.h>
154 #include <sys/queue.h>
155 #include <sys/disk.h>
156 #include <sys/device.h>
157 #include <sys/stat.h>
158 #include <sys/ioctl.h>
159 #include <sys/fcntl.h>
160 #include <sys/systm.h>
161 #include <sys/vnode.h>
162 #include <sys/disklabel.h>
163 #include <sys/conf.h>
164 #include <sys/buf.h>
165 #include <sys/bufq.h>
166 #include <sys/reboot.h>
167 #include <sys/kauth.h>
168
169 #include <prop/proplib.h>
170
171 #include <dev/raidframe/raidframevar.h>
172 #include <dev/raidframe/raidframeio.h>
173
174 #include "rf_raid.h"
175 #include "rf_copyback.h"
176 #include "rf_dag.h"
177 #include "rf_dagflags.h"
178 #include "rf_desc.h"
179 #include "rf_diskqueue.h"
180 #include "rf_etimer.h"
181 #include "rf_general.h"
182 #include "rf_kintf.h"
183 #include "rf_options.h"
184 #include "rf_driver.h"
185 #include "rf_parityscan.h"
186 #include "rf_threadstuff.h"
187
188 #ifdef COMPAT_50
189 #include "rf_compat50.h"
190 #endif
191
192 #ifdef DEBUG
193 int rf_kdebug_level = 0;
194 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
195 #else /* DEBUG */
196 #define db1_printf(a) { }
197 #endif /* DEBUG */
198
199 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
200
201 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
202 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
203
204 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
205 * spare table */
206 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
207 * installation process */
208 #endif
209
210 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
211
212 /* prototypes */
213 static void KernelWakeupFunc(struct buf *);
214 static void InitBP(struct buf *, struct vnode *, unsigned,
215 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
216 void *, int, struct proc *);
217 static void raidinit(RF_Raid_t *);
218
219 void raidattach(int);
220 static int raid_match(device_t, cfdata_t, void *);
221 static void raid_attach(device_t, device_t, void *);
222 static int raid_detach(device_t, int);
223
224 dev_type_open(raidopen);
225 dev_type_close(raidclose);
226 dev_type_read(raidread);
227 dev_type_write(raidwrite);
228 dev_type_ioctl(raidioctl);
229 dev_type_strategy(raidstrategy);
230 dev_type_dump(raiddump);
231 dev_type_size(raidsize);
232
233 const struct bdevsw raid_bdevsw = {
234 raidopen, raidclose, raidstrategy, raidioctl,
235 raiddump, raidsize, D_DISK
236 };
237
238 const struct cdevsw raid_cdevsw = {
239 raidopen, raidclose, raidread, raidwrite, raidioctl,
240 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
241 };
242
243 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
244
245 /* XXX Not sure if the following should be replacing the raidPtrs above,
246 or if it should be used in conjunction with that...
247 */
248
/*
 * Per-unit software state for a RAIDframe pseudo-device.  One of these
 * lives in the global raid_softc[] array for each configured "raid" unit,
 * indexed by unit number.
 */
249 struct raid_softc {
250 device_t sc_dev;
251 int sc_flags; /* flags */
252 int sc_cflags; /* configuration flags */
253 uint64_t sc_size; /* size of the raid device */
254 char sc_xname[20]; /* XXX external name */
255 struct disk sc_dkdev; /* generic disk device info */
256 struct bufq_state *buf_queue; /* used for the device queue */
257 };
258 /* sc_flags */
259 #define RAIDF_INITED 0x01 /* unit has been initialized */
260 #define RAIDF_WLABEL 0x02 /* label area is writable */
261 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
262 #define RAIDF_SHUTDOWN 0x08 /* unit is being shutdown */
263 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
264 #define RAIDF_LOCKED 0x80 /* unit is locked */
265
266 #define raidunit(x) DISKUNIT(x)
267 int numraid = 0;
268
269 extern struct cfdriver raid_cd;
270 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
271 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
272 DVF_DETACH_SHUTDOWN);
273
274 /*
275 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
276 * Be aware that large numbers can allow the driver to consume a lot of
277 * kernel memory, especially on writes, and in degraded mode reads.
278 *
279 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
280 * a single 64K write will typically require 64K for the old data,
281 * 64K for the old parity, and 64K for the new parity, for a total
282 * of 192K (if the parity buffer is not re-used immediately).
283 * Even it if is used immediately, that's still 128K, which when multiplied
284 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
285 *
286 * Now in degraded mode, for example, a 64K read on the above setup may
287 * require data reconstruction, which will require *all* of the 4 remaining
288 * disks to participate -- 4 * 32K/disk == 128K again.
289 */
290
291 #ifndef RAIDOUTSTANDING
292 #define RAIDOUTSTANDING 6
293 #endif
294
295 #define RAIDLABELDEV(dev) \
296 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
297
298 /* declared here, and made public, for the benefit of KVM stuff.. */
299 struct raid_softc *raid_softc;
300
301 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
302 struct disklabel *);
303 static void raidgetdisklabel(dev_t);
304 static void raidmakedisklabel(struct raid_softc *);
305
306 static int raidlock(struct raid_softc *);
307 static void raidunlock(struct raid_softc *);
308
309 static int raid_detach_unlocked(struct raid_softc *);
310
311 static void rf_markalldirty(RF_Raid_t *);
312 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
313
314 void rf_ReconThread(struct rf_recon_req *);
315 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
316 void rf_CopybackThread(RF_Raid_t *raidPtr);
317 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
318 int rf_autoconfig(device_t);
319 void rf_buildroothack(RF_ConfigSet_t *);
320
321 RF_AutoConfig_t *rf_find_raid_components(void);
322 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
323 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
324 static int rf_reasonable_label(RF_ComponentLabel_t *);
325 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
326 int rf_set_autoconfig(RF_Raid_t *, int);
327 int rf_set_rootpartition(RF_Raid_t *, int);
328 void rf_release_all_vps(RF_ConfigSet_t *);
329 void rf_cleanup_config_set(RF_ConfigSet_t *);
330 int rf_have_enough_components(RF_ConfigSet_t *);
331 int rf_auto_config_set(RF_ConfigSet_t *, int *);
332 static int rf_sync_component_caches(RF_Raid_t *raidPtr);
333
334 static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not
335 allow autoconfig to take place.
336 Note that this is overridden by having
337 RAID_AUTOCONFIG as an option in the
338 kernel config file. */
339
340 struct RF_Pools_s rf_pools;
341
/*
 * raidattach: pseudo-device attach routine, called once at boot with the
 * number of RAID units to support.  Allocates the global raidPtrs[] and
 * raid_softc[] arrays, boots the RAIDframe core, hooks the cfattach into
 * autoconf, and registers a finalizer that performs component
 * auto-configuration after all real hardware has been found.
 */
342 void
343 raidattach(int num)
344 {
345 int raidID;
346 int i, rc;
347
348 aprint_debug("raidattach: Asked for %d units\n", num);
349
350 if (num <= 0) {
351 #ifdef DIAGNOSTIC
352 panic("raidattach: count <= 0");
353 #endif
354 return;
355 }
356 /* This is where all the initialization stuff gets done. */
357
358 numraid = num;
359
360 /* Make some space for requested number of units... */
361
362 RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
363 if (raidPtrs == NULL) {
364 panic("raidPtrs is NULL!!");
365 }
366
367 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
368 rf_mutex_init(&rf_sparet_wait_mutex);
369
370 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
371 #endif
372
373 for (i = 0; i < num; i++)
374 raidPtrs[i] = NULL;
375 rc = rf_BootRaidframe();
376 if (rc == 0)
377 aprint_normal("Kernelized RAIDframe activated\n");
378 else
379 panic("Serious error booting RAID!!");
380
381 /* put together some datastructures like the CCD device does.. This
382 * lets us lock the device and what-not when it gets opened. */
383
384 raid_softc = (struct raid_softc *)
385 malloc(num * sizeof(struct raid_softc),
386 M_RAIDFRAME, M_NOWAIT);
387 if (raid_softc == NULL) {
388 aprint_error("WARNING: no memory for RAIDframe driver\n");
389 return;
390 }
391
392 memset(raid_softc, 0, num * sizeof(struct raid_softc));
393
394 for (raidID = 0; raidID < num; raidID++) {
395 bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);
396
397 RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
398 (RF_Raid_t *));
399 if (raidPtrs[raidID] == NULL) {
400 aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
/* Shrink the visible unit count so later code never touches
   the unallocated tail of raidPtrs[]. */
401 numraid = raidID;
402 return;
403 }
404 }
405
406 if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
407 aprint_error("raidattach: config_cfattach_attach failed?\n");
408 }
409
410 #ifdef RAID_AUTOCONFIG
411 raidautoconfig = 1;
412 #endif
413
414 /*
415 * Register a finalizer which will be used to auto-config RAID
416 * sets once all real hardware devices have been found.
417 */
418 if (config_finalize_register(NULL, rf_autoconfig) != 0)
419 aprint_error("WARNING: unable to register RAIDframe finalizer\n");
420 }
421
422 int
423 rf_autoconfig(device_t self)
424 {
425 RF_AutoConfig_t *ac_list;
426 RF_ConfigSet_t *config_sets;
427
428 if (raidautoconfig == 0)
429 return (0);
430
431 /* XXX This code can only be run once. */
432 raidautoconfig = 0;
433
434 /* 1. locate all RAID components on the system */
435 aprint_debug("Searching for RAID components...\n");
436 ac_list = rf_find_raid_components();
437
438 /* 2. Sort them into their respective sets. */
439 config_sets = rf_create_auto_sets(ac_list);
440
441 /*
442 * 3. Evaluate each set andconfigure the valid ones.
443 * This gets done in rf_buildroothack().
444 */
445 rf_buildroothack(config_sets);
446
447 return 1;
448 }
449
/*
 * rf_buildroothack: walk the list of auto-configuration sets.  Each set
 * that has enough components and is marked for autoconfiguration gets
 * configured; the resources of all other sets are released.  If exactly
 * one configured set is rootable, point booted_device at it; if several
 * qualify, try to narrow the choice to the set containing the device we
 * actually booted from, and otherwise set RB_ASKNAME so the user is
 * asked at setroot() time.  Does nothing to booted_device/boothowto if
 * the user hardwired a root device (rootspec != NULL).
 */
450 void
451 rf_buildroothack(RF_ConfigSet_t *config_sets)
452 {
453 RF_ConfigSet_t *cset;
454 RF_ConfigSet_t *next_cset;
455 int retcode;
456 int raidID;
457 int rootID;
458 int col;
459 int num_root;
460 char *devname;
461
462 rootID = 0;
463 num_root = 0;
464 cset = config_sets;
465 while(cset != NULL ) {
466 next_cset = cset->next;
467 if (rf_have_enough_components(cset) &&
468 cset->ac->clabel->autoconfigure==1) {
469 retcode = rf_auto_config_set(cset,&raidID);
470 if (!retcode) {
471 aprint_debug("raid%d: configured ok\n", raidID);
472 if (cset->rootable) {
473 rootID = raidID;
474 num_root++;
475 }
476 } else {
477 /* The autoconfig didn't work :( */
478 aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
479 rf_release_all_vps(cset);
480 }
481 } else {
482 /* we're not autoconfiguring this set...
483 release the associated resources */
484 rf_release_all_vps(cset);
485 }
486 /* cleanup */
487 rf_cleanup_config_set(cset);
488 cset = next_cset;
489 }
490
491 /* if the user has specified what the root device should be
492 then we don't touch booted_device or boothowto... */
493
494 if (rootspec != NULL)
495 return;
496
497 /* we found something bootable... */
498
499 if (num_root == 1) {
500 booted_device = raid_softc[rootID].sc_dev;
501 } else if (num_root > 1) {
502
503 /*
504 * Maybe the MD code can help. If it cannot, then
505 * setroot() will discover that we have no
506 * booted_device and will ask the user if nothing was
507 * hardwired in the kernel config file
508 */
509
510 if (booted_device == NULL)
511 cpu_rootconf();
512 if (booted_device == NULL)
513 return;
514
/* Re-scan: count only the configured, root-capable sets that actually
   contain the component we booted from. */
515 num_root = 0;
516 for (raidID = 0; raidID < numraid; raidID++) {
517 if (raidPtrs[raidID]->valid == 0)
518 continue;
519
520 if (raidPtrs[raidID]->root_partition == 0)
521 continue;
522
523 for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
524 devname = raidPtrs[raidID]->Disks[col].devname;
/* Strip the leading "/dev/" before comparing with the autoconf name. */
525 devname += sizeof("/dev/") - 1;
526 if (strncmp(devname, device_xname(booted_device),
527 strlen(device_xname(booted_device))) != 0)
528 continue;
529 aprint_debug("raid%d includes boot device %s\n",
530 raidID, devname);
531 num_root++;
532 rootID = raidID;
533 }
534 }
535
536 if (num_root == 1) {
537 booted_device = raid_softc[rootID].sc_dev;
538 } else {
539 /* we can't guess.. require the user to answer... */
540 boothowto |= RB_ASKNAME;
541 }
542 }
543 }
544
545
546 int
547 raidsize(dev_t dev)
548 {
549 struct raid_softc *rs;
550 struct disklabel *lp;
551 int part, unit, omask, size;
552
553 unit = raidunit(dev);
554 if (unit >= numraid)
555 return (-1);
556 rs = &raid_softc[unit];
557
558 if ((rs->sc_flags & RAIDF_INITED) == 0)
559 return (-1);
560
561 part = DISKPART(dev);
562 omask = rs->sc_dkdev.dk_openmask & (1 << part);
563 lp = rs->sc_dkdev.dk_label;
564
565 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
566 return (-1);
567
568 if (lp->d_partitions[part].p_fstype != FS_SWAP)
569 size = -1;
570 else
571 size = lp->d_partitions[part].p_size *
572 (lp->d_secsize / DEV_BSIZE);
573
574 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
575 return (-1);
576
577 return (size);
578
579 }
580
/*
 * raiddump: kernel crash-dump entry point.  Only RAID 1 sets (one data
 * column, one parity column) are supported.  Chooses a live component to
 * dump to -- preferring the master, then a used spare of the master,
 * then the slave, then a used spare of the slave -- and forwards the
 * dump to that component's block driver, with blkno adjusted by the
 * partition offset plus RF_PROTECTED_SECTORS.
 */
581 int
582 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
583 {
584 int unit = raidunit(dev);
585 struct raid_softc *rs;
586 const struct bdevsw *bdev;
587 struct disklabel *lp;
588 RF_Raid_t *raidPtr;
589 daddr_t offset;
590 int part, c, sparecol, j, scol, dumpto;
591 int error = 0;
592
593 if (unit >= numraid)
594 return (ENXIO);
595
596 rs = &raid_softc[unit];
597 raidPtr = raidPtrs[unit];
598
599 if ((rs->sc_flags & RAIDF_INITED) == 0)
600 return ENXIO;
601
602 /* we only support dumping to RAID 1 sets */
603 if (raidPtr->Layout.numDataCol != 1 ||
604 raidPtr->Layout.numParityCol != 1)
605 return EINVAL;
606
607
608 if ((error = raidlock(rs)) != 0)
609 return error;
610
/* The dump must be a whole number of DEV_BSIZE blocks... */
611 if (size % DEV_BSIZE != 0) {
612 error = EINVAL;
613 goto out;
614 }
615
/* ...and must fit inside the RAID device. */
616 if (blkno + size / DEV_BSIZE > rs->sc_size) {
617 printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
618 "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
619 size / DEV_BSIZE, rs->sc_size);
620 error = EINVAL;
621 goto out;
622 }
623
624 part = DISKPART(dev);
625 lp = rs->sc_dkdev.dk_label;
626 offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
627
628 /* figure out what device is alive.. */
629
630 /*
631 Look for a component to dump to. The preference for the
632 component to dump to is as follows:
633 1) the master
634 2) a used_spare of the master
635 3) the slave
636 4) a used_spare of the slave
637 */
638
639 dumpto = -1;
640 for (c = 0; c < raidPtr->numCol; c++) {
641 if (raidPtr->Disks[c].status == rf_ds_optimal) {
642 /* this might be the one */
643 dumpto = c;
644 break;
645 }
646 }
647
648 /*
649 At this point we have possibly selected a live master or a
650 live slave. We now check to see if there is a spared
651 master (or a spared slave), if we didn't find a live master
652 or a live slave.
653 */
654
655 for (c = 0; c < raidPtr->numSpare; c++) {
656 sparecol = raidPtr->numCol + c;
657 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
658 /* How about this one? */
659 scol = -1;
/* Find which original column this spare is standing in for. */
660 for(j=0;j<raidPtr->numCol;j++) {
661 if (raidPtr->Disks[j].spareCol == sparecol) {
662 scol = j;
663 break;
664 }
665 }
666 if (scol == 0) {
667 /*
668 We must have found a spared master!
669 We'll take that over anything else
670 found so far. (We couldn't have
671 found a real master before, since
672 this is a used spare, and it's
673 saying that it's replacing the
674 master.) On reboot (with
675 autoconfiguration turned on)
676 sparecol will become the 1st
677 component (component0) of this set.
678 */
679 dumpto = sparecol;
680 break;
681 } else if (scol != -1) {
682 /*
683 Must be a spared slave. We'll dump
684 to that if we havn't found anything
685 else so far.
686 */
687 if (dumpto == -1)
688 dumpto = sparecol;
689 }
690 }
691 }
692
693 if (dumpto == -1) {
694 /* we couldn't find any live components to dump to!?!?
695 */
696 error = EINVAL;
697 goto out;
698 }
699
700 bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
701
702 /*
703 Note that blkno is relative to this particular partition.
704 By adding the offset of this partition in the RAID
705 set, and also adding RF_PROTECTED_SECTORS, we get a
706 value that is relative to the partition used for the
707 underlying component.
708 */
709
710 error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
711 blkno + offset, va, size);
712
713 out:
714 raidunlock(rs);
715
716 return error;
717 }
/*
 * raidopen: open a partition of a RAID unit.  Validates the unit and
 * partition, reads the disklabel on the first open of an initialized
 * set, refuses non-RAW_PART opens while wedges exist, records the open
 * in the char/block open masks, and marks all components dirty on the
 * very first open of an initialized set.  Note that the success path
 * also exits through the "bad:" label with error == 0.
 */
718 /* ARGSUSED */
719 int
720 raidopen(dev_t dev, int flags, int fmt,
721 struct lwp *l)
722 {
723 int unit = raidunit(dev);
724 struct raid_softc *rs;
725 struct disklabel *lp;
726 int part, pmask;
727 int error = 0;
728
729 if (unit >= numraid)
730 return (ENXIO);
731 rs = &raid_softc[unit];
732
733 if ((error = raidlock(rs)) != 0)
734 return (error);
735
/* Refuse new opens on a unit that is being torn down. */
736 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
737 error = EBUSY;
738 goto bad;
739 }
740
741 lp = rs->sc_dkdev.dk_label;
742
743 part = DISKPART(dev);
744
745 /*
746 * If there are wedges, and this is not RAW_PART, then we
747 * need to fail.
748 */
749 if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
750 error = EBUSY;
751 goto bad;
752 }
753 pmask = (1 << part);
754
/* First open of an initialized set: (re)read the disklabel. */
755 if ((rs->sc_flags & RAIDF_INITED) &&
756 (rs->sc_dkdev.dk_openmask == 0))
757 raidgetdisklabel(dev);
758
759 /* make sure that this partition exists */
760
761 if (part != RAW_PART) {
762 if (((rs->sc_flags & RAIDF_INITED) == 0) ||
763 ((part >= lp->d_npartitions) ||
764 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
765 error = ENXIO;
766 goto bad;
767 }
768 }
769 /* Prevent this unit from being unconfigured while open. */
770 switch (fmt) {
771 case S_IFCHR:
772 rs->sc_dkdev.dk_copenmask |= pmask;
773 break;
774
775 case S_IFBLK:
776 rs->sc_dkdev.dk_bopenmask |= pmask;
777 break;
778 }
779
780 if ((rs->sc_dkdev.dk_openmask == 0) &&
781 ((rs->sc_flags & RAIDF_INITED) != 0)) {
782 /* First one... mark things as dirty... Note that we *MUST*
783 have done a configure before this. I DO NOT WANT TO BE
784 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
785 THAT THEY BELONG TOGETHER!!!!! */
786 /* XXX should check to see if we're only open for reading
787 here... If so, we needn't do this, but then need some
788 other way of keeping track of what's happened.. */
789
790 rf_markalldirty( raidPtrs[unit] );
791 }
792
793
794 rs->sc_dkdev.dk_openmask =
795 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
796
797 bad:
798 raidunlock(rs);
799
800 return (error);
801
802
803 }
/*
 * raidclose: close a partition of a RAID unit.  Clears the partition's
 * bit in the char/block open masks; on the last close of an initialized
 * set, writes final ("clean") component labels.  Always returns 0 once
 * the unit lock has been obtained.
 */
804 /* ARGSUSED */
805 int
806 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
807 {
808 int unit = raidunit(dev);
809 struct raid_softc *rs;
810 int error = 0;
811 int part;
812
813 if (unit >= numraid)
814 return (ENXIO);
815 rs = &raid_softc[unit];
816
817 if ((error = raidlock(rs)) != 0)
818 return (error);
819
820 part = DISKPART(dev);
821
822 /* ...that much closer to allowing unconfiguration... */
823 switch (fmt) {
824 case S_IFCHR:
825 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
826 break;
827
828 case S_IFBLK:
829 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
830 break;
831 }
832 rs->sc_dkdev.dk_openmask =
833 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
834
835 if ((rs->sc_dkdev.dk_openmask == 0) &&
836 ((rs->sc_flags & RAIDF_INITED) != 0)) {
837 /* Last one... device is not unconfigured yet.
838 Device shutdown has taken care of setting the
839 clean bits if RAIDF_INITED is not set
840 mark things as clean... */
841
842 rf_update_component_labels(raidPtrs[unit],
843 RF_FINAL_COMPONENT_UPDATE);
844
845 /* If the kernel is shutting down, it will detach
846 * this RAID set soon enough.
847 */
848 }
849
850 raidunlock(rs);
851 return (0);
852
853 }
854
855 void
856 raidstrategy(struct buf *bp)
857 {
858 int s;
859
860 unsigned int raidID = raidunit(bp->b_dev);
861 RF_Raid_t *raidPtr;
862 struct raid_softc *rs = &raid_softc[raidID];
863 int wlabel;
864
865 if ((rs->sc_flags & RAIDF_INITED) ==0) {
866 bp->b_error = ENXIO;
867 goto done;
868 }
869 if (raidID >= numraid || !raidPtrs[raidID]) {
870 bp->b_error = ENODEV;
871 goto done;
872 }
873 raidPtr = raidPtrs[raidID];
874 if (!raidPtr->valid) {
875 bp->b_error = ENODEV;
876 goto done;
877 }
878 if (bp->b_bcount == 0) {
879 db1_printf(("b_bcount is zero..\n"));
880 goto done;
881 }
882
883 /*
884 * Do bounds checking and adjust transfer. If there's an
885 * error, the bounds check will flag that for us.
886 */
887
888 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
889 if (DISKPART(bp->b_dev) == RAW_PART) {
890 uint64_t size; /* device size in DEV_BSIZE unit */
891
892 if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
893 size = raidPtr->totalSectors <<
894 (raidPtr->logBytesPerSector - DEV_BSHIFT);
895 } else {
896 size = raidPtr->totalSectors >>
897 (DEV_BSHIFT - raidPtr->logBytesPerSector);
898 }
899 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
900 goto done;
901 }
902 } else {
903 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
904 db1_printf(("Bounds check failed!!:%d %d\n",
905 (int) bp->b_blkno, (int) wlabel));
906 goto done;
907 }
908 }
909 s = splbio();
910
911 bp->b_resid = 0;
912
913 /* stuff it onto our queue */
914 bufq_put(rs->buf_queue, bp);
915
916 /* scheduled the IO to happen at the next convenient time */
917 wakeup(&(raidPtrs[raidID]->iodone));
918
919 splx(s);
920 return;
921
922 done:
923 bp->b_resid = bp->b_bcount;
924 biodone(bp);
925 }
926 /* ARGSUSED */
927 int
928 raidread(dev_t dev, struct uio *uio, int flags)
929 {
930 int unit = raidunit(dev);
931 struct raid_softc *rs;
932
933 if (unit >= numraid)
934 return (ENXIO);
935 rs = &raid_softc[unit];
936
937 if ((rs->sc_flags & RAIDF_INITED) == 0)
938 return (ENXIO);
939
940 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
941
942 }
943 /* ARGSUSED */
944 int
945 raidwrite(dev_t dev, struct uio *uio, int flags)
946 {
947 int unit = raidunit(dev);
948 struct raid_softc *rs;
949
950 if (unit >= numraid)
951 return (ENXIO);
952 rs = &raid_softc[unit];
953
954 if ((rs->sc_flags & RAIDF_INITED) == 0)
955 return (ENXIO);
956
957 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
958
959 }
960
961 static int
962 raid_detach_unlocked(struct raid_softc *rs)
963 {
964 int error;
965 RF_Raid_t *raidPtr;
966
967 raidPtr = raidPtrs[device_unit(rs->sc_dev)];
968
969 /*
970 * If somebody has a partition mounted, we shouldn't
971 * shutdown.
972 */
973 if (rs->sc_dkdev.dk_openmask != 0)
974 return EBUSY;
975
976 if ((rs->sc_flags & RAIDF_INITED) == 0)
977 ; /* not initialized: nothing to do */
978 else if ((error = rf_Shutdown(raidPtr)) != 0)
979 return error;
980 else
981 rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
982
983 /* Detach the disk. */
984 disk_detach(&rs->sc_dkdev);
985 disk_destroy(&rs->sc_dkdev);
986
987 return 0;
988 }
989
990 int
991 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
992 {
993 int unit = raidunit(dev);
994 int error = 0;
995 int part, pmask;
996 cfdata_t cf;
997 struct raid_softc *rs;
998 RF_Config_t *k_cfg, *u_cfg;
999 RF_Raid_t *raidPtr;
1000 RF_RaidDisk_t *diskPtr;
1001 RF_AccTotals_t *totals;
1002 RF_DeviceConfig_t *d_cfg, **ucfgp;
1003 u_char *specific_buf;
1004 int retcode = 0;
1005 int column;
1006 int raidid;
1007 struct rf_recon_req *rrcopy, *rr;
1008 RF_ComponentLabel_t *clabel;
1009 RF_ComponentLabel_t *ci_label;
1010 RF_ComponentLabel_t **clabel_ptr;
1011 RF_SingleComponent_t *sparePtr,*componentPtr;
1012 RF_SingleComponent_t component;
1013 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1014 int i, j, d;
1015 #ifdef __HAVE_OLD_DISKLABEL
1016 struct disklabel newlabel;
1017 #endif
1018 struct dkwedge_info *dkw;
1019
1020 if (unit >= numraid)
1021 return (ENXIO);
1022 rs = &raid_softc[unit];
1023 raidPtr = raidPtrs[unit];
1024
1025 db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
1026 (int) DISKPART(dev), (int) unit, (int) cmd));
1027
1028 /* Must be open for writes for these commands... */
1029 switch (cmd) {
1030 #ifdef DIOCGSECTORSIZE
1031 case DIOCGSECTORSIZE:
1032 *(u_int *)data = raidPtr->bytesPerSector;
1033 return 0;
1034 case DIOCGMEDIASIZE:
1035 *(off_t *)data =
1036 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1037 return 0;
1038 #endif
1039 case DIOCSDINFO:
1040 case DIOCWDINFO:
1041 #ifdef __HAVE_OLD_DISKLABEL
1042 case ODIOCWDINFO:
1043 case ODIOCSDINFO:
1044 #endif
1045 case DIOCWLABEL:
1046 case DIOCAWEDGE:
1047 case DIOCDWEDGE:
1048 if ((flag & FWRITE) == 0)
1049 return (EBADF);
1050 }
1051
1052 /* Must be initialized for these... */
1053 switch (cmd) {
1054 case DIOCGDINFO:
1055 case DIOCSDINFO:
1056 case DIOCWDINFO:
1057 #ifdef __HAVE_OLD_DISKLABEL
1058 case ODIOCGDINFO:
1059 case ODIOCWDINFO:
1060 case ODIOCSDINFO:
1061 case ODIOCGDEFLABEL:
1062 #endif
1063 case DIOCGPART:
1064 case DIOCWLABEL:
1065 case DIOCGDEFLABEL:
1066 case DIOCAWEDGE:
1067 case DIOCDWEDGE:
1068 case DIOCLWEDGES:
1069 case DIOCCACHESYNC:
1070 case RAIDFRAME_SHUTDOWN:
1071 case RAIDFRAME_REWRITEPARITY:
1072 case RAIDFRAME_GET_INFO:
1073 case RAIDFRAME_RESET_ACCTOTALS:
1074 case RAIDFRAME_GET_ACCTOTALS:
1075 case RAIDFRAME_KEEP_ACCTOTALS:
1076 case RAIDFRAME_GET_SIZE:
1077 case RAIDFRAME_FAIL_DISK:
1078 case RAIDFRAME_COPYBACK:
1079 case RAIDFRAME_CHECK_RECON_STATUS:
1080 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1081 case RAIDFRAME_GET_COMPONENT_LABEL:
1082 case RAIDFRAME_SET_COMPONENT_LABEL:
1083 case RAIDFRAME_ADD_HOT_SPARE:
1084 case RAIDFRAME_REMOVE_HOT_SPARE:
1085 case RAIDFRAME_INIT_LABELS:
1086 case RAIDFRAME_REBUILD_IN_PLACE:
1087 case RAIDFRAME_CHECK_PARITY:
1088 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1089 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1090 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1091 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1092 case RAIDFRAME_SET_AUTOCONFIG:
1093 case RAIDFRAME_SET_ROOT:
1094 case RAIDFRAME_DELETE_COMPONENT:
1095 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1096 if ((rs->sc_flags & RAIDF_INITED) == 0)
1097 return (ENXIO);
1098 }
1099
1100 switch (cmd) {
1101 #ifdef COMPAT_50
1102 case RAIDFRAME_GET_INFO50:
1103 return rf_get_info50(raidPtr, data);
1104
1105 case RAIDFRAME_CONFIGURE50:
1106 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1107 return retcode;
1108 goto config;
1109 #endif
1110 /* configure the system */
1111 case RAIDFRAME_CONFIGURE:
1112
1113 if (raidPtr->valid) {
1114 /* There is a valid RAID set running on this unit! */
1115 printf("raid%d: Device already configured!\n",unit);
1116 return(EINVAL);
1117 }
1118
1119 /* copy-in the configuration information */
1120 /* data points to a pointer to the configuration structure */
1121
1122 u_cfg = *((RF_Config_t **) data);
1123 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1124 if (k_cfg == NULL) {
1125 return (ENOMEM);
1126 }
1127 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1128 if (retcode) {
1129 RF_Free(k_cfg, sizeof(RF_Config_t));
1130 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1131 retcode));
1132 return (retcode);
1133 }
1134 goto config;
1135 config:
1136 /* allocate a buffer for the layout-specific data, and copy it
1137 * in */
1138 if (k_cfg->layoutSpecificSize) {
1139 if (k_cfg->layoutSpecificSize > 10000) {
1140 /* sanity check */
1141 RF_Free(k_cfg, sizeof(RF_Config_t));
1142 return (EINVAL);
1143 }
1144 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1145 (u_char *));
1146 if (specific_buf == NULL) {
1147 RF_Free(k_cfg, sizeof(RF_Config_t));
1148 return (ENOMEM);
1149 }
1150 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1151 k_cfg->layoutSpecificSize);
1152 if (retcode) {
1153 RF_Free(k_cfg, sizeof(RF_Config_t));
1154 RF_Free(specific_buf,
1155 k_cfg->layoutSpecificSize);
1156 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1157 retcode));
1158 return (retcode);
1159 }
1160 } else
1161 specific_buf = NULL;
1162 k_cfg->layoutSpecific = specific_buf;
1163
1164 /* should do some kind of sanity check on the configuration.
1165 * Store the sum of all the bytes in the last byte? */
1166
1167 /* configure the system */
1168
1169 /*
1170 * Clear the entire RAID descriptor, just to make sure
1171 * there is no stale data left in the case of a
1172 * reconfiguration
1173 */
1174 memset((char *) raidPtr, 0, sizeof(RF_Raid_t));
1175 raidPtr->raidid = unit;
1176
1177 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1178
1179 if (retcode == 0) {
1180
1181 /* allow this many simultaneous IO's to
1182 this RAID device */
1183 raidPtr->openings = RAIDOUTSTANDING;
1184
1185 raidinit(raidPtr);
1186 rf_markalldirty(raidPtr);
1187 }
1188 /* free the buffers. No return code here. */
1189 if (k_cfg->layoutSpecificSize) {
1190 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1191 }
1192 RF_Free(k_cfg, sizeof(RF_Config_t));
1193
1194 return (retcode);
1195
1196 /* shutdown the system */
1197 case RAIDFRAME_SHUTDOWN:
1198
1199 part = DISKPART(dev);
1200 pmask = (1 << part);
1201
1202 if ((error = raidlock(rs)) != 0)
1203 return (error);
1204
1205 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1206 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1207 (rs->sc_dkdev.dk_copenmask & pmask)))
1208 retcode = EBUSY;
1209 else {
1210 rs->sc_flags |= RAIDF_SHUTDOWN;
1211 rs->sc_dkdev.dk_copenmask &= ~pmask;
1212 rs->sc_dkdev.dk_bopenmask &= ~pmask;
1213 rs->sc_dkdev.dk_openmask &= ~pmask;
1214 retcode = 0;
1215 }
1216
1217 raidunlock(rs);
1218
1219 if (retcode != 0)
1220 return retcode;
1221
1222 /* free the pseudo device attach bits */
1223
1224 cf = device_cfdata(rs->sc_dev);
1225 if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
1226 free(cf, M_RAIDFRAME);
1227
1228 return (retcode);
1229 case RAIDFRAME_GET_COMPONENT_LABEL:
1230 clabel_ptr = (RF_ComponentLabel_t **) data;
1231 /* need to read the component label for the disk indicated
1232 by row,column in clabel */
1233
1234 /* For practice, let's get it directly fromdisk, rather
1235 than from the in-core copy */
1236 RF_Malloc( clabel, sizeof( RF_ComponentLabel_t ),
1237 (RF_ComponentLabel_t *));
1238 if (clabel == NULL)
1239 return (ENOMEM);
1240
1241 retcode = copyin( *clabel_ptr, clabel,
1242 sizeof(RF_ComponentLabel_t));
1243
1244 if (retcode) {
1245 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
1246 return(retcode);
1247 }
1248
1249 clabel->row = 0; /* Don't allow looking at anything else.*/
1250
1251 column = clabel->column;
1252
1253 if ((column < 0) || (column >= raidPtr->numCol +
1254 raidPtr->numSpare)) {
1255 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
1256 return(EINVAL);
1257 }
1258
1259 retcode = raidread_component_label(raidPtr->Disks[column].dev,
1260 raidPtr->raid_cinfo[column].ci_vp,
1261 clabel );
1262
1263 if (retcode == 0) {
1264 retcode = copyout(clabel, *clabel_ptr,
1265 sizeof(RF_ComponentLabel_t));
1266 }
1267 RF_Free(clabel, sizeof(RF_ComponentLabel_t));
1268 return (retcode);
1269
1270 case RAIDFRAME_SET_COMPONENT_LABEL:
1271 clabel = (RF_ComponentLabel_t *) data;
1272
1273 /* XXX check the label for valid stuff... */
1274 /* Note that some things *should not* get modified --
1275 the user should be re-initing the labels instead of
1276 trying to patch things.
1277 */
1278
1279 raidid = raidPtr->raidid;
1280 #ifdef DEBUG
1281 printf("raid%d: Got component label:\n", raidid);
1282 printf("raid%d: Version: %d\n", raidid, clabel->version);
1283 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1284 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1285 printf("raid%d: Column: %d\n", raidid, clabel->column);
1286 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1287 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1288 printf("raid%d: Status: %d\n", raidid, clabel->status);
1289 #endif
1290 clabel->row = 0;
1291 column = clabel->column;
1292
1293 if ((column < 0) || (column >= raidPtr->numCol)) {
1294 return(EINVAL);
1295 }
1296
1297 /* XXX this isn't allowed to do anything for now :-) */
1298
1299 /* XXX and before it is, we need to fill in the rest
1300 of the fields!?!?!?! */
1301 #if 0
1302 raidwrite_component_label(
1303 raidPtr->Disks[column].dev,
1304 raidPtr->raid_cinfo[column].ci_vp,
1305 clabel );
1306 #endif
1307 return (0);
1308
1309 case RAIDFRAME_INIT_LABELS:
1310 clabel = (RF_ComponentLabel_t *) data;
1311 /*
1312 we only want the serial number from
1313 the above. We get all the rest of the information
1314 from the config that was used to create this RAID
1315 set.
1316 */
1317
1318 raidPtr->serial_number = clabel->serial_number;
1319
1320 RF_Malloc(ci_label, sizeof(RF_ComponentLabel_t),
1321 (RF_ComponentLabel_t *));
1322 if (ci_label == NULL)
1323 return (ENOMEM);
1324
1325 raid_init_component_label(raidPtr, ci_label);
1326 ci_label->serial_number = clabel->serial_number;
1327 ci_label->row = 0; /* we dont' pretend to support more */
1328
1329 for(column=0;column<raidPtr->numCol;column++) {
1330 diskPtr = &raidPtr->Disks[column];
1331 if (!RF_DEAD_DISK(diskPtr->status)) {
1332 ci_label->partitionSize = diskPtr->partitionSize;
1333 ci_label->column = column;
1334 raidwrite_component_label(
1335 raidPtr->Disks[column].dev,
1336 raidPtr->raid_cinfo[column].ci_vp,
1337 ci_label );
1338 }
1339 }
1340 RF_Free(ci_label, sizeof(RF_ComponentLabel_t));
1341
1342 return (retcode);
1343 case RAIDFRAME_SET_AUTOCONFIG:
1344 d = rf_set_autoconfig(raidPtr, *(int *) data);
1345 printf("raid%d: New autoconfig value is: %d\n",
1346 raidPtr->raidid, d);
1347 *(int *) data = d;
1348 return (retcode);
1349
1350 case RAIDFRAME_SET_ROOT:
1351 d = rf_set_rootpartition(raidPtr, *(int *) data);
1352 printf("raid%d: New rootpartition value is: %d\n",
1353 raidPtr->raidid, d);
1354 *(int *) data = d;
1355 return (retcode);
1356
1357 /* initialize all parity */
1358 case RAIDFRAME_REWRITEPARITY:
1359
1360 if (raidPtr->Layout.map->faultsTolerated == 0) {
1361 /* Parity for RAID 0 is trivially correct */
1362 raidPtr->parity_good = RF_RAID_CLEAN;
1363 return(0);
1364 }
1365
1366 if (raidPtr->parity_rewrite_in_progress == 1) {
1367 /* Re-write is already in progress! */
1368 return(EINVAL);
1369 }
1370
1371 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1372 rf_RewriteParityThread,
1373 raidPtr,"raid_parity");
1374 return (retcode);
1375
1376
1377 case RAIDFRAME_ADD_HOT_SPARE:
1378 sparePtr = (RF_SingleComponent_t *) data;
1379 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1380 retcode = rf_add_hot_spare(raidPtr, &component);
1381 return(retcode);
1382
1383 case RAIDFRAME_REMOVE_HOT_SPARE:
1384 return(retcode);
1385
1386 case RAIDFRAME_DELETE_COMPONENT:
1387 componentPtr = (RF_SingleComponent_t *)data;
1388 memcpy( &component, componentPtr,
1389 sizeof(RF_SingleComponent_t));
1390 retcode = rf_delete_component(raidPtr, &component);
1391 return(retcode);
1392
1393 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1394 componentPtr = (RF_SingleComponent_t *)data;
1395 memcpy( &component, componentPtr,
1396 sizeof(RF_SingleComponent_t));
1397 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1398 return(retcode);
1399
1400 case RAIDFRAME_REBUILD_IN_PLACE:
1401
1402 if (raidPtr->Layout.map->faultsTolerated == 0) {
1403 /* Can't do this on a RAID 0!! */
1404 return(EINVAL);
1405 }
1406
1407 if (raidPtr->recon_in_progress == 1) {
1408 /* a reconstruct is already in progress! */
1409 return(EINVAL);
1410 }
1411
1412 componentPtr = (RF_SingleComponent_t *) data;
1413 memcpy( &component, componentPtr,
1414 sizeof(RF_SingleComponent_t));
1415 component.row = 0; /* we don't support any more */
1416 column = component.column;
1417
1418 if ((column < 0) || (column >= raidPtr->numCol)) {
1419 return(EINVAL);
1420 }
1421
1422 RF_LOCK_MUTEX(raidPtr->mutex);
1423 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1424 (raidPtr->numFailures > 0)) {
1425 /* XXX 0 above shouldn't be constant!!! */
1426 /* some component other than this has failed.
1427 Let's not make things worse than they already
1428 are... */
1429 printf("raid%d: Unable to reconstruct to disk at:\n",
1430 raidPtr->raidid);
1431 printf("raid%d: Col: %d Too many failures.\n",
1432 raidPtr->raidid, column);
1433 RF_UNLOCK_MUTEX(raidPtr->mutex);
1434 return (EINVAL);
1435 }
1436 if (raidPtr->Disks[column].status ==
1437 rf_ds_reconstructing) {
1438 printf("raid%d: Unable to reconstruct to disk at:\n",
1439 raidPtr->raidid);
1440 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column);
1441
1442 RF_UNLOCK_MUTEX(raidPtr->mutex);
1443 return (EINVAL);
1444 }
1445 if (raidPtr->Disks[column].status == rf_ds_spared) {
1446 RF_UNLOCK_MUTEX(raidPtr->mutex);
1447 return (EINVAL);
1448 }
1449 RF_UNLOCK_MUTEX(raidPtr->mutex);
1450
1451 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1452 if (rrcopy == NULL)
1453 return(ENOMEM);
1454
1455 rrcopy->raidPtr = (void *) raidPtr;
1456 rrcopy->col = column;
1457
1458 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1459 rf_ReconstructInPlaceThread,
1460 rrcopy,"raid_reconip");
1461 return(retcode);
1462
1463 case RAIDFRAME_GET_INFO:
1464 if (!raidPtr->valid)
1465 return (ENODEV);
1466 ucfgp = (RF_DeviceConfig_t **) data;
1467 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1468 (RF_DeviceConfig_t *));
1469 if (d_cfg == NULL)
1470 return (ENOMEM);
1471 d_cfg->rows = 1; /* there is only 1 row now */
1472 d_cfg->cols = raidPtr->numCol;
1473 d_cfg->ndevs = raidPtr->numCol;
1474 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1475 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1476 return (ENOMEM);
1477 }
1478 d_cfg->nspares = raidPtr->numSpare;
1479 if (d_cfg->nspares >= RF_MAX_DISKS) {
1480 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1481 return (ENOMEM);
1482 }
1483 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1484 d = 0;
1485 for (j = 0; j < d_cfg->cols; j++) {
1486 d_cfg->devs[d] = raidPtr->Disks[j];
1487 d++;
1488 }
1489 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1490 d_cfg->spares[i] = raidPtr->Disks[j];
1491 }
1492 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1493 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1494
1495 return (retcode);
1496
1497 case RAIDFRAME_CHECK_PARITY:
1498 *(int *) data = raidPtr->parity_good;
1499 return (0);
1500
1501 case RAIDFRAME_RESET_ACCTOTALS:
1502 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1503 return (0);
1504
1505 case RAIDFRAME_GET_ACCTOTALS:
1506 totals = (RF_AccTotals_t *) data;
1507 *totals = raidPtr->acc_totals;
1508 return (0);
1509
1510 case RAIDFRAME_KEEP_ACCTOTALS:
1511 raidPtr->keep_acc_totals = *(int *)data;
1512 return (0);
1513
1514 case RAIDFRAME_GET_SIZE:
1515 *(int *) data = raidPtr->totalSectors;
1516 return (0);
1517
1518 /* fail a disk & optionally start reconstruction */
1519 case RAIDFRAME_FAIL_DISK:
1520
1521 if (raidPtr->Layout.map->faultsTolerated == 0) {
1522 /* Can't do this on a RAID 0!! */
1523 return(EINVAL);
1524 }
1525
1526 rr = (struct rf_recon_req *) data;
1527 rr->row = 0;
1528 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1529 return (EINVAL);
1530
1531
1532 RF_LOCK_MUTEX(raidPtr->mutex);
1533 if (raidPtr->status == rf_rs_reconstructing) {
1534 /* you can't fail a disk while we're reconstructing! */
1535 /* XXX wrong for RAID6 */
1536 RF_UNLOCK_MUTEX(raidPtr->mutex);
1537 return (EINVAL);
1538 }
1539 if ((raidPtr->Disks[rr->col].status ==
1540 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1541 /* some other component has failed. Let's not make
1542 things worse. XXX wrong for RAID6 */
1543 RF_UNLOCK_MUTEX(raidPtr->mutex);
1544 return (EINVAL);
1545 }
1546 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1547 /* Can't fail a spared disk! */
1548 RF_UNLOCK_MUTEX(raidPtr->mutex);
1549 return (EINVAL);
1550 }
1551 RF_UNLOCK_MUTEX(raidPtr->mutex);
1552
1553 /* make a copy of the recon request so that we don't rely on
1554 * the user's buffer */
1555 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1556 if (rrcopy == NULL)
1557 return(ENOMEM);
1558 memcpy(rrcopy, rr, sizeof(*rr));
1559 rrcopy->raidPtr = (void *) raidPtr;
1560
1561 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1562 rf_ReconThread,
1563 rrcopy,"raid_recon");
1564 return (0);
1565
1566 /* invoke a copyback operation after recon on whatever disk
1567 * needs it, if any */
1568 case RAIDFRAME_COPYBACK:
1569
1570 if (raidPtr->Layout.map->faultsTolerated == 0) {
1571 /* This makes no sense on a RAID 0!! */
1572 return(EINVAL);
1573 }
1574
1575 if (raidPtr->copyback_in_progress == 1) {
1576 /* Copyback is already in progress! */
1577 return(EINVAL);
1578 }
1579
1580 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1581 rf_CopybackThread,
1582 raidPtr,"raid_copyback");
1583 return (retcode);
1584
1585 /* return the percentage completion of reconstruction */
1586 case RAIDFRAME_CHECK_RECON_STATUS:
1587 if (raidPtr->Layout.map->faultsTolerated == 0) {
1588 /* This makes no sense on a RAID 0, so tell the
1589 user it's done. */
1590 *(int *) data = 100;
1591 return(0);
1592 }
1593 if (raidPtr->status != rf_rs_reconstructing)
1594 *(int *) data = 100;
1595 else {
1596 if (raidPtr->reconControl->numRUsTotal > 0) {
1597 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1598 } else {
1599 *(int *) data = 0;
1600 }
1601 }
1602 return (0);
1603 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1604 progressInfoPtr = (RF_ProgressInfo_t **) data;
1605 if (raidPtr->status != rf_rs_reconstructing) {
1606 progressInfo.remaining = 0;
1607 progressInfo.completed = 100;
1608 progressInfo.total = 100;
1609 } else {
1610 progressInfo.total =
1611 raidPtr->reconControl->numRUsTotal;
1612 progressInfo.completed =
1613 raidPtr->reconControl->numRUsComplete;
1614 progressInfo.remaining = progressInfo.total -
1615 progressInfo.completed;
1616 }
1617 retcode = copyout(&progressInfo, *progressInfoPtr,
1618 sizeof(RF_ProgressInfo_t));
1619 return (retcode);
1620
1621 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1622 if (raidPtr->Layout.map->faultsTolerated == 0) {
1623 /* This makes no sense on a RAID 0, so tell the
1624 user it's done. */
1625 *(int *) data = 100;
1626 return(0);
1627 }
1628 if (raidPtr->parity_rewrite_in_progress == 1) {
1629 *(int *) data = 100 *
1630 raidPtr->parity_rewrite_stripes_done /
1631 raidPtr->Layout.numStripe;
1632 } else {
1633 *(int *) data = 100;
1634 }
1635 return (0);
1636
1637 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1638 progressInfoPtr = (RF_ProgressInfo_t **) data;
1639 if (raidPtr->parity_rewrite_in_progress == 1) {
1640 progressInfo.total = raidPtr->Layout.numStripe;
1641 progressInfo.completed =
1642 raidPtr->parity_rewrite_stripes_done;
1643 progressInfo.remaining = progressInfo.total -
1644 progressInfo.completed;
1645 } else {
1646 progressInfo.remaining = 0;
1647 progressInfo.completed = 100;
1648 progressInfo.total = 100;
1649 }
1650 retcode = copyout(&progressInfo, *progressInfoPtr,
1651 sizeof(RF_ProgressInfo_t));
1652 return (retcode);
1653
1654 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1655 if (raidPtr->Layout.map->faultsTolerated == 0) {
1656 /* This makes no sense on a RAID 0 */
1657 *(int *) data = 100;
1658 return(0);
1659 }
1660 if (raidPtr->copyback_in_progress == 1) {
1661 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1662 raidPtr->Layout.numStripe;
1663 } else {
1664 *(int *) data = 100;
1665 }
1666 return (0);
1667
1668 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1669 progressInfoPtr = (RF_ProgressInfo_t **) data;
1670 if (raidPtr->copyback_in_progress == 1) {
1671 progressInfo.total = raidPtr->Layout.numStripe;
1672 progressInfo.completed =
1673 raidPtr->copyback_stripes_done;
1674 progressInfo.remaining = progressInfo.total -
1675 progressInfo.completed;
1676 } else {
1677 progressInfo.remaining = 0;
1678 progressInfo.completed = 100;
1679 progressInfo.total = 100;
1680 }
1681 retcode = copyout(&progressInfo, *progressInfoPtr,
1682 sizeof(RF_ProgressInfo_t));
1683 return (retcode);
1684
1685 /* the sparetable daemon calls this to wait for the kernel to
1686 * need a spare table. this ioctl does not return until a
1687 * spare table is needed. XXX -- calling mpsleep here in the
1688 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1689 * -- I should either compute the spare table in the kernel,
1690 * or have a different -- XXX XXX -- interface (a different
1691 * character device) for delivering the table -- XXX */
1692 #if 0
1693 case RAIDFRAME_SPARET_WAIT:
1694 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1695 while (!rf_sparet_wait_queue)
1696 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1697 waitreq = rf_sparet_wait_queue;
1698 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1699 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1700
1701 /* structure assignment */
1702 *((RF_SparetWait_t *) data) = *waitreq;
1703
1704 RF_Free(waitreq, sizeof(*waitreq));
1705 return (0);
1706
1707 /* wakes up a process waiting on SPARET_WAIT and puts an error
1708 * code in it that will cause the dameon to exit */
1709 case RAIDFRAME_ABORT_SPARET_WAIT:
1710 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1711 waitreq->fcol = -1;
1712 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1713 waitreq->next = rf_sparet_wait_queue;
1714 rf_sparet_wait_queue = waitreq;
1715 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1716 wakeup(&rf_sparet_wait_queue);
1717 return (0);
1718
1719 /* used by the spare table daemon to deliver a spare table
1720 * into the kernel */
1721 case RAIDFRAME_SEND_SPARET:
1722
1723 /* install the spare table */
1724 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1725
1726 /* respond to the requestor. the return status of the spare
1727 * table installation is passed in the "fcol" field */
1728 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1729 waitreq->fcol = retcode;
1730 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1731 waitreq->next = rf_sparet_resp_queue;
1732 rf_sparet_resp_queue = waitreq;
1733 wakeup(&rf_sparet_resp_queue);
1734 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1735
1736 return (retcode);
1737 #endif
1738
1739 default:
1740 break; /* fall through to the os-specific code below */
1741
1742 }
1743
1744 if (!raidPtr->valid)
1745 return (EINVAL);
1746
1747 /*
1748 * Add support for "regular" device ioctls here.
1749 */
1750
1751 error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
1752 if (error != EPASSTHROUGH)
1753 return (error);
1754
1755 switch (cmd) {
1756 case DIOCGDINFO:
1757 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1758 break;
1759 #ifdef __HAVE_OLD_DISKLABEL
1760 case ODIOCGDINFO:
1761 newlabel = *(rs->sc_dkdev.dk_label);
1762 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1763 return ENOTTY;
1764 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1765 break;
1766 #endif
1767
1768 case DIOCGPART:
1769 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1770 ((struct partinfo *) data)->part =
1771 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1772 break;
1773
1774 case DIOCWDINFO:
1775 case DIOCSDINFO:
1776 #ifdef __HAVE_OLD_DISKLABEL
1777 case ODIOCWDINFO:
1778 case ODIOCSDINFO:
1779 #endif
1780 {
1781 struct disklabel *lp;
1782 #ifdef __HAVE_OLD_DISKLABEL
1783 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1784 memset(&newlabel, 0, sizeof newlabel);
1785 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1786 lp = &newlabel;
1787 } else
1788 #endif
1789 lp = (struct disklabel *)data;
1790
1791 if ((error = raidlock(rs)) != 0)
1792 return (error);
1793
1794 rs->sc_flags |= RAIDF_LABELLING;
1795
1796 error = setdisklabel(rs->sc_dkdev.dk_label,
1797 lp, 0, rs->sc_dkdev.dk_cpulabel);
1798 if (error == 0) {
1799 if (cmd == DIOCWDINFO
1800 #ifdef __HAVE_OLD_DISKLABEL
1801 || cmd == ODIOCWDINFO
1802 #endif
1803 )
1804 error = writedisklabel(RAIDLABELDEV(dev),
1805 raidstrategy, rs->sc_dkdev.dk_label,
1806 rs->sc_dkdev.dk_cpulabel);
1807 }
1808 rs->sc_flags &= ~RAIDF_LABELLING;
1809
1810 raidunlock(rs);
1811
1812 if (error)
1813 return (error);
1814 break;
1815 }
1816
1817 case DIOCWLABEL:
1818 if (*(int *) data != 0)
1819 rs->sc_flags |= RAIDF_WLABEL;
1820 else
1821 rs->sc_flags &= ~RAIDF_WLABEL;
1822 break;
1823
1824 case DIOCGDEFLABEL:
1825 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1826 break;
1827
1828 #ifdef __HAVE_OLD_DISKLABEL
1829 case ODIOCGDEFLABEL:
1830 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1831 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1832 return ENOTTY;
1833 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1834 break;
1835 #endif
1836
1837 case DIOCAWEDGE:
1838 case DIOCDWEDGE:
1839 dkw = (void *)data;
1840
1841 /* If the ioctl happens here, the parent is us. */
1842 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1843 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1844
1845 case DIOCLWEDGES:
1846 return dkwedge_list(&rs->sc_dkdev,
1847 (struct dkwedge_list *)data, l);
1848 case DIOCCACHESYNC:
1849 return rf_sync_component_caches(raidPtr);
1850 default:
1851 retcode = ENOTTY;
1852 }
1853 return (retcode);
1854
1855 }
1856
1857
1858 /* raidinit -- complete the rest of the initialization for the
1859 RAIDframe device. */
1860
1861
/*
 * Complete the initialization of a freshly configured RAIDframe unit:
 * mark the softc initialized, attach the pseudo-device to autoconf,
 * and register the unit with the disk(9) subsystem so wedges and
 * disklabels work.  Called from the configure path after rf_Configure()
 * has succeeded.
 */
static void
raidinit(RF_Raid_t *raidPtr)
{
	cfdata_t cf;
	struct raid_softc *rs;
	int unit;

	unit = raidPtr->raidid;

	rs = &raid_softc[unit];

	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/*
	 * Attach the pseudo device: hand-build a cfdata record and give
	 * it to autoconf.  On success the record stays with the device
	 * (it is recovered via device_cfdata() and freed in the
	 * RAIDFRAME_SHUTDOWN ioctl path); on failure we free it here.
	 */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev==NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		/* Undo the INITED flag set above so the unit is not
		 * treated as usable. */
		rs->sc_flags &= ~RAIDF_INITED;
		free(cf, M_RAIDFRAME);
		return;
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe. */

	rs->sc_size = raidPtr->totalSectors;

	/* Scan the new device for wedges now that it is attached. */
	dkwedge_discover(&rs->sc_dkdev);

	rf_set_properties(rs, raidPtr);

}
1913 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1914 /* wake up the daemon & tell it to get us a spare table
1915 * XXX
1916 * the entries in the queues should be tagged with the raidPtr
1917 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
1919 * XXX
1920 *
1921 * XXX This code is not currently used. GO
1922 */
/*
 * Ask the user-level sparetable daemon for a spare table: enqueue the
 * request on rf_sparet_wait_queue, wake the daemon (which is blocked in
 * the RAIDFRAME_SPARET_WAIT ioctl), then sleep until a response appears
 * on rf_sparet_resp_queue.
 *
 * Returns the status the daemon delivered in the response's fcol field.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	RF_LOCK_MUTEX(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	wakeup(&rf_sparet_wait_queue);

	/* mpsleep unlocks the mutex */
	/*
	 * NOTE(review): the comment above describes the old mpsleep()
	 * interface; tsleep() as called here does NOT release
	 * rf_sparet_wait_mutex, so this appears to sleep with the mutex
	 * held -- confirm.  (Per the XXX above, this code is currently
	 * unused.)
	 */
	while (!rf_sparet_resp_queue) {
		tsleep(&rf_sparet_resp_queue, PRIBIO,
		    "raidframe getsparetable", 0);
	}
	/* Pop the response; note this is a different object from the
	 * request we queued. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1947 #endif
1948
1949 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1950 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O.
 * It does some extra sanity checking to return "appropriate" error values
 * for certain conditions (to make some standard utilities work)
1954 *
1955 * Formerly known as: rf_DoAccessKernel
1956 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	int unit;
	struct raid_softc *rs;
	int do_async;
	struct buf *bp;
	int rc;

	unit = raidPtr->raidid;
	rs = &raid_softc[unit];

	/* quick check to see if anything has died recently */
	RF_LOCK_MUTEX(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Component labels are rewritten with the mutex dropped;
		 * the failure count is decremented only after the update
		 * completes. */
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
		    RF_NORMAL_COMPONENT_UPDATE);
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Loop invariant: raidPtr->mutex is held when the condition is
	 * evaluated and released while each buffer is processed. */
	while (raidPtr->openings > 0) {
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			(int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb is 1 iff the byte count is not sector-aligned. */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* NOTE(review): the "1 ||" forces this debug branch on
		 * unconditionally -- looks like a leftover; confirm intent
		 * before removing it. */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				(int) raid_addr, (int) sum, (int) num_blocks,
				(int) pb, (int) bp->b_resid));
		}
		/* Reject requests past the end of the array; the "sum <"
		 * comparisons catch arithmetic wrap-around in the sum. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Reject transfers that are not a whole number of sectors. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->openings--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
		    RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
		    do_async, raid_addr, num_blocks,
		    bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* NOTE(review): openings was decremented and
			 * disk_busy() was called above, but this error path
			 * does not visibly undo either -- verify whether the
			 * completion path handles it or accounting leaks
			 * here. */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		RF_LOCK_MUTEX(raidPtr->mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->mutex);
}
2077
2078
2079
2080
2081 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2082
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	/* map the RAIDframe request type onto the buf-layer direction flag */
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* no real I/O: complete immediately through the normal
		 * completion path so queue accounting stays balanced */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf for the component device; KernelWakeupFunc
		 * will run at biodone time with req in bp->b_private */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* This is the callback function associated with an I/O invoked from
   kernel code.
 */
/*
 * Completion callback for component I/O dispatched by
 * rf_DispatchKernelIO().  Recovers the request from bp->b_private,
 * marks the component failed on error (when the set can tolerate it),
 * and hands the request to the raidio thread via the iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;
	int s;

	s = splbio();
	db1_printf(("recovering the request queue:\n"));
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		RF_LOCK_MUTEX(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		RF_UNLOCK_MUTEX(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		/* a component currently optimal or acting as a spare can
		 * still be failed, provided numFailures stays below what
		 * the layout tolerates */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */

	req->error = bp->b_error;

	simple_lock(&queue->raidPtr->iodone_lock);

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	wakeup(&(queue->raidPtr->iodone));

	simple_unlock(&queue->raidPtr->iodone_lock);

	splx(s);
}
2228
2229
2230
2231 /*
2232 * initialize a buf structure for doing an I/O in the kernel.
2233 */
2234 static void
2235 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2236 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2237 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2238 struct proc *b_proc)
2239 {
2240 /* bp->b_flags = B_PHYS | rw_flag; */
2241 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2242 bp->b_oflags = 0;
2243 bp->b_cflags = 0;
2244 bp->b_bcount = numSect << logBytesPerSector;
2245 bp->b_bufsize = bp->b_bcount;
2246 bp->b_error = 0;
2247 bp->b_dev = dev;
2248 bp->b_data = bf;
2249 bp->b_blkno = startSect;
2250 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2251 if (bp->b_bcount == 0) {
2252 panic("bp->b_bcount is zero in InitBP!!");
2253 }
2254 bp->b_proc = b_proc;
2255 bp->b_iodone = cbFunc;
2256 bp->b_private = cbArg;
2257 }
2258
2259 static void
2260 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2261 struct disklabel *lp)
2262 {
2263 memset(lp, 0, sizeof(*lp));
2264
2265 /* fabricate a label... */
2266 lp->d_secperunit = raidPtr->totalSectors;
2267 lp->d_secsize = raidPtr->bytesPerSector;
2268 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2269 lp->d_ntracks = 4 * raidPtr->numCol;
2270 lp->d_ncylinders = raidPtr->totalSectors /
2271 (lp->d_nsectors * lp->d_ntracks);
2272 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2273
2274 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2275 lp->d_type = DTYPE_RAID;
2276 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2277 lp->d_rpm = 3600;
2278 lp->d_interleave = 1;
2279 lp->d_flags = 0;
2280
2281 lp->d_partitions[RAW_PART].p_offset = 0;
2282 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2283 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2284 lp->d_npartitions = RAW_PART + 1;
2285
2286 lp->d_magic = DISKMAGIC;
2287 lp->d_magic2 = DISKMAGIC;
2288 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2289
2290 }
2291 /*
2292 * Read the disklabel from the raid device. If one is not present, fake one
2293 * up.
2294 */
static void
raidgetdisklabel(dev_t dev)
{
	int unit = raidunit(dev);
	struct raid_softc *rs = &raid_softc[unit];
	const char *errstring;
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
	RF_Raid_t *raidPtr;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = raidPtrs[unit];

	/* start from a fabricated label; a real on-disk one (if any)
	 * overwrites it below */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		/* no label found on disk: synthesize one */
		raidmakedisklabel(rs);
	else {
		int i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%" PRIu64 ")\n",
				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
2347 /*
2348 * Take care of things one might want to take care of in the event
2349 * that a disklabel isn't present.
2350 */
2351 static void
2352 raidmakedisklabel(struct raid_softc *rs)
2353 {
2354 struct disklabel *lp = rs->sc_dkdev.dk_label;
2355 db1_printf(("Making a label..\n"));
2356
2357 /*
2358 * For historical reasons, if there's no disklabel present
2359 * the raw partition must be marked FS_BSDFFS.
2360 */
2361
2362 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2363
2364 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2365
2366 lp->d_checksum = dkcksum(lp);
2367 }
2368 /*
2369 * Wait interruptibly for an exclusive lock.
2370 *
2371 * XXX
2372 * Several drivers do this; it should be abstracted and made MP-safe.
2373 * (Hmm... where have we seen this warning before :-> GO )
2374 */
2375 static int
2376 raidlock(struct raid_softc *rs)
2377 {
2378 int error;
2379
2380 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2381 rs->sc_flags |= RAIDF_WANTED;
2382 if ((error =
2383 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2384 return (error);
2385 }
2386 rs->sc_flags |= RAIDF_LOCKED;
2387 return (0);
2388 }
2389 /*
2390 * Unlock and wake up any waiters.
2391 */
2392 static void
2393 raidunlock(struct raid_softc *rs)
2394 {
2395
2396 rs->sc_flags &= ~RAIDF_LOCKED;
2397 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2398 rs->sc_flags &= ~RAIDF_WANTED;
2399 wakeup(rs);
2400 }
2401 }
2402
2403
2404 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2405 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2406
2407 int
2408 raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter)
2409 {
2410 RF_ComponentLabel_t clabel;
2411 raidread_component_label(dev, b_vp, &clabel);
2412 clabel.mod_counter = mod_counter;
2413 clabel.clean = RF_RAID_CLEAN;
2414 raidwrite_component_label(dev, b_vp, &clabel);
2415 return(0);
2416 }
2417
2418
2419 int
2420 raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter)
2421 {
2422 RF_ComponentLabel_t clabel;
2423 raidread_component_label(dev, b_vp, &clabel);
2424 clabel.mod_counter = mod_counter;
2425 clabel.clean = RF_RAID_DIRTY;
2426 raidwrite_component_label(dev, b_vp, &clabel);
2427 return(0);
2428 }
2429
2430 /* ARGSUSED */
2431 int
2432 raidread_component_label(dev_t dev, struct vnode *b_vp,
2433 RF_ComponentLabel_t *clabel)
2434 {
2435 struct buf *bp;
2436 const struct bdevsw *bdev;
2437 int error;
2438
2439 /* XXX should probably ensure that we don't try to do this if
2440 someone has changed rf_protected_sectors. */
2441
2442 if (b_vp == NULL) {
2443 /* For whatever reason, this component is not valid.
2444 Don't try to read a component label from it. */
2445 return(EINVAL);
2446 }
2447
2448 /* get a block of the appropriate size... */
2449 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2450 bp->b_dev = dev;
2451
2452 /* get our ducks in a row for the read */
2453 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2454 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2455 bp->b_flags |= B_READ;
2456 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2457
2458 bdev = bdevsw_lookup(bp->b_dev);
2459 if (bdev == NULL)
2460 return (ENXIO);
2461 (*bdev->d_strategy)(bp);
2462
2463 error = biowait(bp);
2464
2465 if (!error) {
2466 memcpy(clabel, bp->b_data,
2467 sizeof(RF_ComponentLabel_t));
2468 }
2469
2470 brelse(bp, 0);
2471 return(error);
2472 }
2473 /* ARGSUSED */
2474 int
2475 raidwrite_component_label(dev_t dev, struct vnode *b_vp,
2476 RF_ComponentLabel_t *clabel)
2477 {
2478 struct buf *bp;
2479 const struct bdevsw *bdev;
2480 int error;
2481
2482 /* get a block of the appropriate size... */
2483 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2484 bp->b_dev = dev;
2485
2486 /* get our ducks in a row for the write */
2487 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2488 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2489 bp->b_flags |= B_WRITE;
2490 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2491
2492 memset(bp->b_data, 0, RF_COMPONENT_INFO_SIZE );
2493
2494 memcpy(bp->b_data, clabel, sizeof(RF_ComponentLabel_t));
2495
2496 bdev = bdevsw_lookup(bp->b_dev);
2497 if (bdev == NULL)
2498 return (ENXIO);
2499 (*bdev->d_strategy)(bp);
2500 error = biowait(bp);
2501 brelse(bp, 0);
2502 if (error) {
2503 #if 1
2504 printf("Failed to write RAID component info!\n");
2505 #endif
2506 }
2507
2508 return(error);
2509 }
2510
/*
 * Bump the set's modification counter and mark every live component
 * (and every in-use spare) dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			raidread_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			if (clabel.status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(
				    raidPtr->Disks[c].dev,
				    raidPtr->raid_cinfo[c].ci_vp,
				    raidPtr->mod_counter);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for;
			 * scol stays -1 if no component points at it */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			raidread_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, &clabel);

			/* record the spare under the column it replaced */
			clabel.row = 0;
			clabel.column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr->Disks[sparecol].dev,
				      raidPtr->raid_cinfo[sparecol].ci_vp,
				      raidPtr->mod_counter);
		}
	}
}
2581
2582
/*
 * Rewrite the component labels of all optimal components and in-use
 * spares with the current mod_counter and unit number.  When `final'
 * is RF_FINAL_COMPONENT_UPDATE and parity is known good, the labels
 * are additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			raidread_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			/* make sure status is noted */
			clabel.status = rf_ds_optimal;

			/* bump the counter */
			clabel.mod_counter = raidPtr->mod_counter;

			/* note what unit we are configured as */
			clabel.last_unit = raidPtr->raidid;

			raidwrite_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(
						raidPtr->Disks[c].dev,
						raidPtr->raid_cinfo[c].ci_vp,
						raidPtr->mod_counter);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare replaced; scol stays
			 * -1 if no component references it */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			raidread_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, &clabel);

			clabel.mod_counter = raidPtr->mod_counter;
			clabel.column = scol;
			clabel.status = rf_ds_optimal;
			clabel.last_unit = raidPtr->raidid;

			raidwrite_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean( raidPtr->Disks[sparecol].dev,
						       raidPtr->raid_cinfo[sparecol].ci_vp,
						       raidPtr->mod_counter);
				}
			}
		}
	}
}
2678
2679 void
2680 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2681 {
2682
2683 if (vp != NULL) {
2684 if (auto_configured == 1) {
2685 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2686 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2687 vput(vp);
2688
2689 } else {
2690 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2691 }
2692 }
2693 }
2694
2695
2696 void
2697 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2698 {
2699 int r,c;
2700 struct vnode *vp;
2701 int acd;
2702
2703
2704 /* We take this opportunity to close the vnodes like we should.. */
2705
2706 for (c = 0; c < raidPtr->numCol; c++) {
2707 vp = raidPtr->raid_cinfo[c].ci_vp;
2708 acd = raidPtr->Disks[c].auto_configured;
2709 rf_close_component(raidPtr, vp, acd);
2710 raidPtr->raid_cinfo[c].ci_vp = NULL;
2711 raidPtr->Disks[c].auto_configured = 0;
2712 }
2713
2714 for (r = 0; r < raidPtr->numSpare; r++) {
2715 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2716 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2717 rf_close_component(raidPtr, vp, acd);
2718 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2719 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2720 }
2721 }
2722
2723
/*
 * Kernel thread body: fail the component named in `req' and (if
 * requested via RF_FDFLAGS_RECON) reconstruct it onto a spare.
 * Frees `req' and exits; never returns.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2745
/*
 * Kernel thread body: rewrite all parity for the set.  On success the
 * in-core parity_good flag is set so the clean bit can be written at
 * shutdown.  Wakes any shutdown waiter and exits; never returns.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity!\n",raidPtr->raidid);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2775
2776
/*
 * Kernel thread body: copy reconstructed data from spares back onto
 * replaced components.  Exits when done; never returns.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2791
2792
/*
 * Kernel thread body: reconstruct the component in req->col in place
 * (i.e. back onto the same device).  Frees `req' and exits; never
 * returns.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2810
/*
 * Probe one candidate device for a RAIDframe component label.  On a
 * reasonable label, prepend a new RF_AutoConfig_t to ac_list and keep
 * the vnode open; otherwise close the vnode and leave the list as-is.
 * Returns the (possibly updated) list head, or NULL after freeing the
 * whole list on allocation failure.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* tear down everything collected so far */
		    /* NOTE(review): vp is not closed on this path —
		       confirm whether that is intentional */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel) &&
		    (clabel->partitionSize <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			       cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2867
/*
 * Scan every disk device in the system for RAIDframe component labels
 * (both dk wedges and FS_RAID disklabel partitions) and return a list
 * of candidate components for autoconfiguration.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	dev_t dev;
	int bmajor, bminor, wedge;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;


	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = alldevs.tqh_first; dv != NULL;
	     dv = dv->dv_list.tqe_next) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		/* get a vnode for the raw partition of this disk */

		/* dk wedges use a plain minor; others use the raw
		 * partition of the disk */
		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		if (wedge) {
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* only wedges typed as RAIDframe are candidates */
			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component takes ownership of vp */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size);
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		/* examine each FS_RAID partition of this disk */
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size);
		}
	}
	return ac_list;
}
3010
3011
3012 static int
3013 rf_reasonable_label(RF_ComponentLabel_t *clabel)
3014 {
3015
3016 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3017 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3018 ((clabel->clean == RF_RAID_CLEAN) ||
3019 (clabel->clean == RF_RAID_DIRTY)) &&
3020 clabel->row >=0 &&
3021 clabel->column >= 0 &&
3022 clabel->num_rows > 0 &&
3023 clabel->num_columns > 0 &&
3024 clabel->row < clabel->num_rows &&
3025 clabel->column < clabel->num_columns &&
3026 clabel->blockSize > 0 &&
3027 clabel->numBlocks > 0) {
3028 /* label looks reasonable enough... */
3029 return(1);
3030 }
3031 return(0);
3032 }
3033
3034
#ifdef DEBUG
/* Dump the interesting fields of a component label (debug builds only). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status );
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %d\n",
	       (char) clabel->parityConfig, clabel->blockSize,
	       clabel->numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No" );
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No" );
	printf("   Last configured as: raid%d\n", clabel->last_unit );
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3062
/*
 * Partition the flat list of discovered components into config sets:
 * each component is linked into the first set it "fits" (per
 * rf_does_it_fit), or starts a new set.  Consumes ac_list's next
 * pointers; returns the list of sets.
 */
RF_ConfigSet_t *
rf_create_auto_sets(RF_AutoConfig_t *ac_list)
{
	RF_AutoConfig_t *ac;
	RF_ConfigSet_t *config_sets;
	RF_ConfigSet_t *cset;
	RF_AutoConfig_t *ac_next;


	config_sets = NULL;

	/* Go through the AutoConfig list, and figure out which components
	   belong to what sets.  */
	ac = ac_list;
	while(ac!=NULL) {
		/* we're going to putz with ac->next, so save it here
		   for use at the end of the loop */
		ac_next = ac->next;

		if (config_sets == NULL) {
			/* will need at least this one... */
			config_sets = (RF_ConfigSet_t *)
				malloc(sizeof(RF_ConfigSet_t),
				       M_RAIDFRAME, M_NOWAIT);
			if (config_sets == NULL) {
				panic("rf_create_auto_sets: No memory!");
			}
			/* this one is easy :) */
			config_sets->ac = ac;
			config_sets->next = NULL;
			config_sets->rootable = 0;
			ac->next = NULL;
		} else {
			/* which set does this component fit into? */
			cset = config_sets;
			while(cset!=NULL) {
				if (rf_does_it_fit(cset, ac)) {
					/* looks like it matches... */
					ac->next = cset->ac;
					cset->ac = ac;
					break;
				}
				cset = cset->next;
			}
			if (cset==NULL) {
				/* didn't find a match above... new set..*/
				cset = (RF_ConfigSet_t *)
					malloc(sizeof(RF_ConfigSet_t),
					       M_RAIDFRAME, M_NOWAIT);
				if (cset == NULL) {
					panic("rf_create_auto_sets: No memory!");
				}
				cset->ac = ac;
				ac->next = NULL;
				cset->next = config_sets;
				cset->rootable = 0;
				config_sets = cset;
			}
		}
		ac = ac_next;
	}


	return(config_sets);
}
3128
3129 static int
3130 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3131 {
3132 RF_ComponentLabel_t *clabel1, *clabel2;
3133
3134 /* If this one matches the *first* one in the set, that's good
3135 enough, since the other members of the set would have been
3136 through here too... */
3137 /* note that we are not checking partitionSize here..
3138
3139 Note that we are also not checking the mod_counters here.
3140 If everything else matches execpt the mod_counter, that's
3141 good enough for this test. We will deal with the mod_counters
3142 a little later in the autoconfiguration process.
3143
3144 (clabel1->mod_counter == clabel2->mod_counter) &&
3145
3146 The reason we don't check for this is that failed disks
3147 will have lower modification counts. If those disks are
3148 not added to the set they used to belong to, then they will
3149 form their own set, which may result in 2 different sets,
3150 for example, competing to be configured at raid0, and
3151 perhaps competing to be the root filesystem set. If the
3152 wrong ones get configured, or both attempt to become /,
3153 weird behaviour and or serious lossage will occur. Thus we
3154 need to bring them into the fold here, and kick them out at
3155 a later point.
3156
3157 */
3158
3159 clabel1 = cset->ac->clabel;
3160 clabel2 = ac->clabel;
3161 if ((clabel1->version == clabel2->version) &&
3162 (clabel1->serial_number == clabel2->serial_number) &&
3163 (clabel1->num_rows == clabel2->num_rows) &&
3164 (clabel1->num_columns == clabel2->num_columns) &&
3165 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3166 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3167 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3168 (clabel1->parityConfig == clabel2->parityConfig) &&
3169 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3170 (clabel1->blockSize == clabel2->blockSize) &&
3171 (clabel1->numBlocks == clabel2->numBlocks) &&
3172 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3173 (clabel1->root_partition == clabel2->root_partition) &&
3174 (clabel1->last_unit == clabel2->last_unit) &&
3175 (clabel1->config_order == clabel2->config_order)) {
3176 /* if it get's here, it almost *has* to be a match */
3177 } else {
3178 /* it's not consistent with somebody in the set..
3179 punt */
3180 return(0);
3181 }
3182 /* all was fine.. it must fit... */
3183 return(1);
3184 }
3185
/*
 * Decide whether the config set has enough live, current components
 * (those carrying the highest mod_counter) to be configured.  RAID 1
 * is special-cased: losing both halves of any even/odd mirror pair is
 * fatal, while other layouts simply count total missing components.
 * Returns 1 if configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* the highest mod_counter seen wins; stale components carry a
	 * lower one */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current component claiming column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just finished the odd half of a mirror pair
			   without bailing: reset even_pair_failed for
			   the next pair and continue... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3288
3289 void
3290 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3291 RF_Raid_t *raidPtr)
3292 {
3293 RF_ComponentLabel_t *clabel;
3294 int i;
3295
3296 clabel = ac->clabel;
3297
3298 /* 1. Fill in the common stuff */
3299 config->numRow = clabel->num_rows = 1;
3300 config->numCol = clabel->num_columns;
3301 config->numSpare = 0; /* XXX should this be set here? */
3302 config->sectPerSU = clabel->sectPerSU;
3303 config->SUsPerPU = clabel->SUsPerPU;
3304 config->SUsPerRU = clabel->SUsPerRU;
3305 config->parityConfig = clabel->parityConfig;
3306 /* XXX... */
3307 strcpy(config->diskQueueType,"fifo");
3308 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3309 config->layoutSpecificSize = 0; /* XXX ?? */
3310
3311 while(ac!=NULL) {
3312 /* row/col values will be in range due to the checks
3313 in reasonable_label() */
3314 strcpy(config->devnames[0][ac->clabel->column],
3315 ac->devname);
3316 ac = ac->next;
3317 }
3318
3319 for(i=0;i<RF_MAXDBGV;i++) {
3320 config->debugVars[i][0] = 0;
3321 }
3322 }
3323
3324 int
3325 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3326 {
3327 RF_ComponentLabel_t clabel;
3328 struct vnode *vp;
3329 dev_t dev;
3330 int column;
3331 int sparecol;
3332
3333 raidPtr->autoconfigure = new_value;
3334
3335 for(column=0; column<raidPtr->numCol; column++) {
3336 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3337 dev = raidPtr->Disks[column].dev;
3338 vp = raidPtr->raid_cinfo[column].ci_vp;
3339 raidread_component_label(dev, vp, &clabel);
3340 clabel.autoconfigure = new_value;
3341 raidwrite_component_label(dev, vp, &clabel);
3342 }
3343 }
3344 for(column = 0; column < raidPtr->numSpare ; column++) {
3345 sparecol = raidPtr->numCol + column;
3346 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3347 dev = raidPtr->Disks[sparecol].dev;
3348 vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3349 raidread_component_label(dev, vp, &clabel);
3350 clabel.autoconfigure = new_value;
3351 raidwrite_component_label(dev, vp, &clabel);
3352 }
3353 }
3354 return(new_value);
3355 }
3356
3357 int
3358 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3359 {
3360 RF_ComponentLabel_t clabel;
3361 struct vnode *vp;
3362 dev_t dev;
3363 int column;
3364 int sparecol;
3365
3366 raidPtr->root_partition = new_value;
3367 for(column=0; column<raidPtr->numCol; column++) {
3368 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3369 dev = raidPtr->Disks[column].dev;
3370 vp = raidPtr->raid_cinfo[column].ci_vp;
3371 raidread_component_label(dev, vp, &clabel);
3372 clabel.root_partition = new_value;
3373 raidwrite_component_label(dev, vp, &clabel);
3374 }
3375 }
3376 for(column = 0; column < raidPtr->numSpare ; column++) {
3377 sparecol = raidPtr->numCol + column;
3378 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3379 dev = raidPtr->Disks[sparecol].dev;
3380 vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3381 raidread_component_label(dev, vp, &clabel);
3382 clabel.root_partition = new_value;
3383 raidwrite_component_label(dev, vp, &clabel);
3384 }
3385 }
3386 return(new_value);
3387 }
3388
3389 void
3390 rf_release_all_vps(RF_ConfigSet_t *cset)
3391 {
3392 RF_AutoConfig_t *ac;
3393
3394 ac = cset->ac;
3395 while(ac!=NULL) {
3396 /* Close the vp, and give it back */
3397 if (ac->vp) {
3398 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3399 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3400 vput(ac->vp);
3401 ac->vp = NULL;
3402 }
3403 ac = ac->next;
3404 }
3405 }
3406
3407
3408 void
3409 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3410 {
3411 RF_AutoConfig_t *ac;
3412 RF_AutoConfig_t *next_ac;
3413
3414 ac = cset->ac;
3415 while(ac!=NULL) {
3416 next_ac = ac->next;
3417 /* nuke the label */
3418 free(ac->clabel, M_RAIDFRAME);
3419 /* cleanup the config structure */
3420 free(ac, M_RAIDFRAME);
3421 /* "next.." */
3422 ac = next_ac;
3423 }
3424 /* and, finally, nuke the config set */
3425 free(cset, M_RAIDFRAME);
3426 }
3427
3428
3429 void
3430 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3431 {
3432 /* current version number */
3433 clabel->version = RF_COMPONENT_LABEL_VERSION;
3434 clabel->serial_number = raidPtr->serial_number;
3435 clabel->mod_counter = raidPtr->mod_counter;
3436 clabel->num_rows = 1;
3437 clabel->num_columns = raidPtr->numCol;
3438 clabel->clean = RF_RAID_DIRTY; /* not clean */
3439 clabel->status = rf_ds_optimal; /* "It's good!" */
3440
3441 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3442 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3443 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3444
3445 clabel->blockSize = raidPtr->bytesPerSector;
3446 clabel->numBlocks = raidPtr->sectorsPerDisk;
3447
3448 /* XXX not portable */
3449 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3450 clabel->maxOutstanding = raidPtr->maxOutstanding;
3451 clabel->autoconfigure = raidPtr->autoconfigure;
3452 clabel->root_partition = raidPtr->root_partition;
3453 clabel->last_unit = raidPtr->raidid;
3454 clabel->config_order = raidPtr->config_order;
3455 }
3456
3457 int
3458 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
3459 {
3460 RF_Raid_t *raidPtr;
3461 RF_Config_t *config;
3462 int raidID;
3463 int retcode;
3464
3465 #ifdef DEBUG
3466 printf("RAID autoconfigure\n");
3467 #endif
3468
3469 retcode = 0;
3470 *unit = -1;
3471
3472 /* 1. Create a config structure */
3473
3474 config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
3475 M_RAIDFRAME,
3476 M_NOWAIT);
3477 if (config==NULL) {
3478 printf("Out of mem!?!?\n");
3479 /* XXX do something more intelligent here. */
3480 return(1);
3481 }
3482
3483 memset(config, 0, sizeof(RF_Config_t));
3484
3485 /*
3486 2. Figure out what RAID ID this one is supposed to live at
3487 See if we can get the same RAID dev that it was configured
3488 on last time..
3489 */
3490
3491 raidID = cset->ac->clabel->last_unit;
3492 if ((raidID < 0) || (raidID >= numraid)) {
3493 /* let's not wander off into lala land. */
3494 raidID = numraid - 1;
3495 }
3496 if (raidPtrs[raidID]->valid != 0) {
3497
3498 /*
3499 Nope... Go looking for an alternative...
3500 Start high so we don't immediately use raid0 if that's
3501 not taken.
3502 */
3503
3504 for(raidID = numraid - 1; raidID >= 0; raidID--) {
3505 if (raidPtrs[raidID]->valid == 0) {
3506 /* can use this one! */
3507 break;
3508 }
3509 }
3510 }
3511
3512 if (raidID < 0) {
3513 /* punt... */
3514 printf("Unable to auto configure this set!\n");
3515 printf("(Out of RAID devs!)\n");
3516 free(config, M_RAIDFRAME);
3517 return(1);
3518 }
3519
3520 #ifdef DEBUG
3521 printf("Configuring raid%d:\n",raidID);
3522 #endif
3523
3524 raidPtr = raidPtrs[raidID];
3525
3526 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3527 raidPtr->raidid = raidID;
3528 raidPtr->openings = RAIDOUTSTANDING;
3529
3530 /* 3. Build the configuration structure */
3531 rf_create_configuration(cset->ac, config, raidPtr);
3532
3533 /* 4. Do the configuration */
3534 retcode = rf_Configure(raidPtr, config, cset->ac);
3535
3536 if (retcode == 0) {
3537
3538 raidinit(raidPtrs[raidID]);
3539
3540 rf_markalldirty(raidPtrs[raidID]);
3541 raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
3542 if (cset->ac->clabel->root_partition==1) {
3543 /* everything configured just fine. Make a note
3544 that this set is eligible to be root. */
3545 cset->rootable = 1;
3546 /* XXX do this here? */
3547 raidPtrs[raidID]->root_partition = 1;
3548 }
3549 }
3550
3551 /* 5. Cleanup */
3552 free(config, M_RAIDFRAME);
3553
3554 *unit = raidID;
3555 return(retcode);
3556 }
3557
3558 void
3559 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3560 {
3561 struct buf *bp;
3562
3563 bp = (struct buf *)desc->bp;
3564 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3565 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3566 }
3567
/*
 * Initialize a pool with RAIDframe's standard settings (IPL_BIO, no
 * alignment constraints), pre-allocate xmin items, and set the low/high
 * watermarks to xmin/xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);	/* pre-allocate so the low-water mark is met */
	pool_setlowat(p, xmin);
}
3577
3578 /*
3579 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3580 * if there is IO pending and if that IO could possibly be done for a
3581 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3582 * otherwise.
3583 *
3584 */
3585
3586 int
3587 rf_buf_queue_check(int raidid)
3588 {
3589 if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) &&
3590 raidPtrs[raidid]->openings > 0) {
3591 /* there is work to do */
3592 return 0;
3593 }
3594 /* default is nothing to do */
3595 return 1;
3596 }
3597
3598 int
3599 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
3600 {
3601 struct partinfo dpart;
3602 struct dkwedge_info dkw;
3603 int error;
3604
3605 error = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, l->l_cred);
3606 if (error == 0) {
3607 diskPtr->blockSize = dpart.disklab->d_secsize;
3608 diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
3609 diskPtr->partitionSize = dpart.part->p_size;
3610 return 0;
3611 }
3612
3613 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, l->l_cred);
3614 if (error == 0) {
3615 diskPtr->blockSize = 512; /* XXX */
3616 diskPtr->numBlocks = dkw.dkw_size - rf_protectedSectors;
3617 diskPtr->partitionSize = dkw.dkw_size;
3618 return 0;
3619 }
3620 return error;
3621 }
3622
/* Autoconf match: a raid(4) pseudo-device always matches. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3628
/* Autoconf attach: no work is done at attach time. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3634
3635
3636 static int
3637 raid_detach(device_t self, int flags)
3638 {
3639 int error;
3640 struct raid_softc *rs = &raid_softc[device_unit(self)];
3641
3642 if ((error = raidlock(rs)) != 0)
3643 return (error);
3644
3645 error = raid_detach_unlocked(rs);
3646
3647 raidunlock(rs);
3648
3649 return error;
3650 }
3651
/*
 * Publish a synthetic disk geometry for the raid device via proplib:
 * build a "geometry" dictionary, hang it off a fresh "disk-info"
 * dictionary on the device, and install that dictionary as the disk's
 * dk_info, releasing any previous one.
 */
static void
rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	prop_dictionary_t disk_info, odisk_info, geom;
	disk_info = prop_dictionary_create();
	geom = prop_dictionary_create();
	prop_dictionary_set_uint64(geom, "sectors-per-unit",
				   raidPtr->totalSectors);
	prop_dictionary_set_uint32(geom, "sector-size",
				   raidPtr->bytesPerSector);

	/* The track/cylinder numbers below are fabricated from the
	   stripe geometry, not real hardware geometry. */
	prop_dictionary_set_uint16(geom, "sectors-per-track",
				   raidPtr->Layout.dataSectorsPerStripe);
	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
				   4 * raidPtr->numCol);

	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
	    raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
	    (4 * raidPtr->numCol)));

	/* disk-info retains geom, so drop our creation reference. */
	prop_dictionary_set(disk_info, "geometry", geom);
	prop_object_release(geom);
	prop_dictionary_set(device_properties(rs->sc_dev),
	    "disk-info", disk_info);
	/* Swap in the new dictionary and release the old one, if any. */
	odisk_info = rs->sc_dkdev.dk_info;
	rs->sc_dkdev.dk_info = disk_info;
	if (odisk_info)
		prop_object_release(odisk_info);
}
3681
3682 /*
3683 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3684 * We end up returning whatever error was returned by the first cache flush
3685 * that fails.
3686 */
3687
3688 static int
3689 rf_sync_component_caches(RF_Raid_t *raidPtr)
3690 {
3691 int c, sparecol;
3692 int e,error;
3693 int force = 1;
3694
3695 error = 0;
3696 for (c = 0; c < raidPtr->numCol; c++) {
3697 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3698 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3699 &force, FWRITE, NOCRED);
3700 if (e) {
3701 if (e != ENODEV)
3702 printf("raid%d: cache flush to component %s failed.\n",
3703 raidPtr->raidid, raidPtr->Disks[c].devname);
3704 if (error == 0) {
3705 error = e;
3706 }
3707 }
3708 }
3709 }
3710
3711 for( c = 0; c < raidPtr->numSpare ; c++) {
3712 sparecol = raidPtr->numCol + c;
3713 /* Need to ensure that the reconstruct actually completed! */
3714 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3715 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3716 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3717 if (e) {
3718 if (e != ENODEV)
3719 printf("raid%d: cache flush to component %s failed.\n",
3720 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3721 if (error == 0) {
3722 error = e;
3723 }
3724 }
3725 }
3726 }
3727 return error;
3728 }
3729