/*	$NetBSD: rf_netbsdkintf.c,v 1.224.2.3 2008/09/25 19:45:22 bouyer Exp $	*/
2 /*-
3 * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Greg Oster; Jason R. Thorpe.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the NetBSD
20 * Foundation, Inc. and its contributors.
21 * 4. Neither the name of The NetBSD Foundation nor the names of its
22 * contributors may be used to endorse or promote products derived
23 * from this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38 /*
39 * Copyright (c) 1990, 1993
40 * The Regents of the University of California. All rights reserved.
41 *
42 * This code is derived from software contributed to Berkeley by
43 * the Systems Programming Group of the University of Utah Computer
44 * Science Department.
45 *
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted provided that the following conditions
48 * are met:
49 * 1. Redistributions of source code must retain the above copyright
50 * notice, this list of conditions and the following disclaimer.
51 * 2. Redistributions in binary form must reproduce the above copyright
52 * notice, this list of conditions and the following disclaimer in the
53 * documentation and/or other materials provided with the distribution.
54 * 3. Neither the name of the University nor the names of its contributors
55 * may be used to endorse or promote products derived from this software
56 * without specific prior written permission.
57 *
58 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
59 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
60 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 *
70 * from: Utah $Hdr: cd.c 1.6 90/11/28$
71 *
72 * @(#)cd.c 8.2 (Berkeley) 11/16/93
73 */
74
75 /*
76 * Copyright (c) 1988 University of Utah.
77 *
78 * This code is derived from software contributed to Berkeley by
79 * the Systems Programming Group of the University of Utah Computer
80 * Science Department.
81 *
82 * Redistribution and use in source and binary forms, with or without
83 * modification, are permitted provided that the following conditions
84 * are met:
85 * 1. Redistributions of source code must retain the above copyright
86 * notice, this list of conditions and the following disclaimer.
87 * 2. Redistributions in binary form must reproduce the above copyright
88 * notice, this list of conditions and the following disclaimer in the
89 * documentation and/or other materials provided with the distribution.
90 * 3. All advertising materials mentioning features or use of this software
91 * must display the following acknowledgement:
92 * This product includes software developed by the University of
93 * California, Berkeley and its contributors.
94 * 4. Neither the name of the University nor the names of its contributors
95 * may be used to endorse or promote products derived from this software
96 * without specific prior written permission.
97 *
98 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
99 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
100 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
101 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
102 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
103 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
104 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
105 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
106 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
107 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
108 * SUCH DAMAGE.
109 *
110 * from: Utah $Hdr: cd.c 1.6 90/11/28$
111 *
112 * @(#)cd.c 8.2 (Berkeley) 11/16/93
113 */
114
115 /*
116 * Copyright (c) 1995 Carnegie-Mellon University.
117 * All rights reserved.
118 *
119 * Authors: Mark Holland, Jim Zelenka
120 *
121 * Permission to use, copy, modify and distribute this software and
122 * its documentation is hereby granted, provided that both the copyright
123 * notice and this permission notice appear in all copies of the
124 * software, derivative works or modified versions, and any portions
125 * thereof, and that both notices appear in supporting documentation.
126 *
127 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
128 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
129 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
130 *
131 * Carnegie Mellon requests users of this software to return to
132 *
133 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
134 * School of Computer Science
135 * Carnegie Mellon University
136 * Pittsburgh PA 15213-3890
137 *
138 * any improvements or extensions that they make and grant Carnegie the
139 * rights to redistribute these changes.
140 */
141
142 /***********************************************************
143 *
144 * rf_kintf.c -- the kernel interface routines for RAIDframe
145 *
146 ***********************************************************/
147
148 #include <sys/cdefs.h>
149 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.224.2.3 2008/09/25 19:45:22 bouyer Exp $");
150
151 #include <sys/param.h>
152 #include <sys/errno.h>
153 #include <sys/pool.h>
154 #include <sys/proc.h>
155 #include <sys/queue.h>
156 #include <sys/disk.h>
157 #include <sys/device.h>
158 #include <sys/stat.h>
159 #include <sys/ioctl.h>
160 #include <sys/fcntl.h>
161 #include <sys/systm.h>
162 #include <sys/namei.h>
163 #include <sys/vnode.h>
164 #include <sys/disklabel.h>
165 #include <sys/conf.h>
166 #include <sys/lock.h>
167 #include <sys/buf.h>
168 #include <sys/bufq.h>
169 #include <sys/user.h>
170 #include <sys/reboot.h>
171 #include <sys/kauth.h>
172
173 #include <prop/proplib.h>
174
175 #include <dev/raidframe/raidframevar.h>
176 #include <dev/raidframe/raidframeio.h>
177 #include "raid.h"
178 #include "opt_raid_autoconfig.h"
179 #include "rf_raid.h"
180 #include "rf_copyback.h"
181 #include "rf_dag.h"
182 #include "rf_dagflags.h"
183 #include "rf_desc.h"
184 #include "rf_diskqueue.h"
185 #include "rf_etimer.h"
186 #include "rf_general.h"
187 #include "rf_kintf.h"
188 #include "rf_options.h"
189 #include "rf_driver.h"
190 #include "rf_parityscan.h"
191 #include "rf_threadstuff.h"
192
/* Debug printf wrapper: only emits output when rf_kdebug_level > 0. */
#ifdef DEBUG
int     rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else				/* DEBUG */
#define db1_printf(a) { }
#endif				/* DEBUG */

static RF_Raid_t **raidPtrs;	/* global raid device descriptors */

/* Protects the two sparet queues below. */
RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */

/* Malloc type used for all RAIDframe kernel allocations. */
MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
210
/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, caddr_t, void (*) (struct buf *),
    void *, int, struct proc *);
static void raidinit(RF_Raid_t *);

void raidattach(int);

/* autoconf(9) glue for the raid pseudo-device. */
static int raid_match(struct device *, struct cfdata *, void *);
static void raid_attach(struct device *, struct device *, void *);
static int raid_detach(struct device *, int);

/* Character/block device entry points (see bdevsw/cdevsw below). */
dev_type_open(raidopen);
dev_type_close(raidclose);
dev_type_read(raidread);
dev_type_write(raidwrite);
dev_type_ioctl(raidioctl);
dev_type_strategy(raidstrategy);
dev_type_dump(raiddump);
dev_type_size(raidsize);

/* Block device switch entry: raid behaves as a disk (D_DISK). */
const struct bdevsw raid_bdevsw = {
	raidopen, raidclose, raidstrategy, raidioctl,
	raiddump, raidsize, D_DISK
};

/* Character device switch entry. */
const struct cdevsw raid_cdevsw = {
	raidopen, raidclose, raidread, raidwrite, raidioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};
241
/* XXX Not sure if the following should be replacing the raidPtrs above,
   or if it should be used in conjunction with that...
*/

/*
 * Per-unit software state for a raid pseudo-device.  One entry per
 * configured unit lives in the raid_softc[] array allocated in
 * raidattach().
 */
struct raid_softc {
	struct device *sc_dev;	/* autoconf device handle */
	int     sc_flags;	/* flags (RAIDF_* below) */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */

/* Extract the unit number from a raid dev_t. */
#define	raidunit(x)	DISKUNIT(x)
int numraid = 0;		/* number of units; set in raidattach() */
264
extern struct cfdriver raid_cd;
CFATTACH_DECL(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL);

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even it if is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif

/* dev_t of the raw partition used for disklabel operations. */
#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */
struct raid_softc *raid_softc;

static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
				     struct disklabel *);
static void raidgetdisklabel(dev_t);
static void raidmakedisklabel(struct raid_softc *);

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_properties(struct raid_softc *, RF_Raid_t *);

/* Kernel-thread entry points for long-running RAID operations. */
void rf_ReconThread(struct rf_recon_req *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req *);
int rf_autoconfig(struct device *self);
void rf_buildroothack(RF_ConfigSet_t *);

/* Component-label based auto-configuration helpers. */
RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
static int rf_reasonable_label(RF_ComponentLabel_t *);
void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
int rf_auto_config_set(RF_ConfigSet_t *, int *);

static int raidautoconfig = 0; /* Debugging, mostly.  Set to 0 to not
				  allow autoconfig to take place.
				  Note that this is overridden by having
				  RAID_AUTOCONFIG as an option in the
				  kernel config file.  */

struct RF_Pools_s rf_pools;
333
/*
 * raidattach: pseudo-device attach routine, called once at boot with
 * the number of raid units requested by the kernel configuration.
 * Allocates the global raidPtrs[] descriptor array and raid_softc[]
 * array, boots the core RAIDframe code, hooks the driver into
 * autoconf(9), and registers a config finalizer so auto-configurable
 * sets are assembled once all real hardware has been found.
 *
 * On partial allocation failure, numraid is trimmed to the number of
 * units actually set up and the function returns early.
 */
void
raidattach(int num)
{
	int raidID;
	int i, rc;

#ifdef DEBUG
	printf("raidattach: Asked for %d units\n", num);
#endif

	if (num <= 0) {
#ifdef DIAGNOSTIC
		panic("raidattach: count <= 0");
#endif
		return;
	}
	/* This is where all the initialization stuff gets done. */

	numraid = num;

	/* Make some space for requested number of units... */

	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
	if (raidPtrs == NULL) {
		panic("raidPtrs is NULL!!");
	}

	rf_mutex_init(&rf_sparet_wait_mutex);

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;

	for (i = 0; i < num; i++)
		raidPtrs[i] = NULL;
	rc = rf_BootRaidframe();
	if (rc == 0)
		printf("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	/* put together some datastructures like the CCD device does.. This
	 * lets us lock the device and what-not when it gets opened. */

	raid_softc = (struct raid_softc *)
	    malloc(num * sizeof(struct raid_softc),
	    M_RAIDFRAME, M_NOWAIT);
	if (raid_softc == NULL) {
		printf("WARNING: no memory for RAIDframe driver\n");
		return;
	}

	memset(raid_softc, 0, num * sizeof(struct raid_softc));

	for (raidID = 0; raidID < num; raidID++) {
		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);

		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
			  (RF_Raid_t *));
		if (raidPtrs[raidID] == NULL) {
			printf("WARNING: raidPtrs[%d] is NULL\n", raidID);
			/* Trim numraid so later code never indexes a
			 * NULL descriptor. */
			numraid = raidID;
			return;
		}
	}

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		printf("config_cfattach_attach failed?\n");
	}

#ifdef RAID_AUTOCONFIG
	raidautoconfig = 1;
#endif

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		printf("WARNING: unable to register RAIDframe finalizer\n");
}
413
/*
 * rf_autoconfig: config finalizer callback.  Searches every disk for
 * RAIDframe component labels, groups the components into sets, and
 * configures each complete set via rf_buildroothack().  Returns 1 the
 * first (and only) time it does work, 0 on subsequent calls.
 */
int
rf_autoconfig(struct device *self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (raidautoconfig == 0)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfig = 0;

	/* 1. locate all RAID components on the system */
#ifdef DEBUG
	printf("Searching for RAID components...\n");
#endif
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
443
444 void
445 rf_buildroothack(RF_ConfigSet_t *config_sets)
446 {
447 RF_ConfigSet_t *cset;
448 RF_ConfigSet_t *next_cset;
449 int retcode;
450 int raidID;
451 int rootID;
452 int num_root;
453
454 rootID = 0;
455 num_root = 0;
456 cset = config_sets;
457 while(cset != NULL ) {
458 next_cset = cset->next;
459 if (rf_have_enough_components(cset) &&
460 cset->ac->clabel->autoconfigure==1) {
461 retcode = rf_auto_config_set(cset,&raidID);
462 if (!retcode) {
463 #ifdef DEBUG
464 printf("raid%d: configured ok\n", raidID);
465 #endif
466 if (cset->rootable) {
467 rootID = raidID;
468 num_root++;
469 }
470 } else {
471 /* The autoconfig didn't work :( */
472 #ifdef DEBUG
473 printf("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
474 #endif
475 rf_release_all_vps(cset);
476 }
477 } else {
478 #ifdef DEBUG
479 printf("raid%d: not enough components\n", raidID);
480 #endif
481 /* we're not autoconfiguring this set...
482 release the associated resources */
483 rf_release_all_vps(cset);
484 }
485 /* cleanup */
486 rf_cleanup_config_set(cset);
487 cset = next_cset;
488 }
489
490 /* if the user has specified what the root device should be
491 then we don't touch booted_device or boothowto... */
492
493 if (rootspec != NULL)
494 return;
495
496 /* we found something bootable... */
497
498 if (num_root == 1) {
499 booted_device = raid_softc[rootID].sc_dev;
500 } else if (num_root > 1) {
501 /* we can't guess.. require the user to answer... */
502 boothowto |= RB_ASKNAME;
503 }
504 }
505
506
/*
 * raidsize: return the size, in DEV_BSIZE units, of the given swap
 * partition on a raid unit, or -1 if the unit is not initialized, the
 * partition is not FS_SWAP, or the device cannot be opened.  Used by
 * the kernel dump code.  If no partition on the unit is currently
 * open, the device is opened for the duration of the query.
 */
int
raidsize(dev_t dev)
{
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, unit, omask, size;

	unit = raidunit(dev);
	if (unit >= numraid)
		return (-1);
	rs = &raid_softc[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (-1);

	part = DISKPART(dev);
	omask = rs->sc_dkdev.dk_openmask & (1 << part);
	lp = rs->sc_dkdev.dk_label;

	/* Not already open: open the block device temporarily. */
	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
		return (-1);

	if (lp->d_partitions[part].p_fstype != FS_SWAP)
		size = -1;
	else
		size = lp->d_partitions[part].p_size *
		    (lp->d_secsize / DEV_BSIZE);

	/* Balance the temporary open above. */
	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
		return (-1);

	return (size);

}
541
/*
 * raiddump: kernel crash-dump entry point.  Writes "size" bytes from
 * "va" at block "blkno" of the dump partition.  Only RAID 1 sets are
 * supported, and the dump goes to a single live component chosen by
 * the preference order documented below, bypassing the RAID layer.
 */
int
raiddump(dev_t dev, daddr_t blkno, caddr_t va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if (unit >= numraid)
		return (ENXIO);

	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* Dumps must be a whole number of DEV_BSIZE blocks. */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* Refuse to write past the end of the raid device. */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* Find which column this spare is standing in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	 */

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
/* ARGSUSED */
/*
 * raidopen: open entry point for both the block and character device.
 * Validates the unit and partition, (re)reads the disklabel on the
 * first open of an initialized unit, records the open in the
 * appropriate openmask, and on the very first open marks all
 * components dirty so an unclean shutdown can be detected.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);
	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of an initialized unit: refresh the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty( raidPtrs[unit] );
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
/* ARGSUSED */
/*
 * raidclose: close entry point.  Clears the partition's bit in the
 * relevant openmask; when the last partition closes on an initialized
 * unit, writes final ("clean") component labels, and if the system is
 * shutting down, also shuts the RAID set itself down and detaches the
 * pseudo-device.
 *
 * NOTE(review): errors from rf_Shutdown()/config_detach() are stored
 * in 'error' but the function unconditionally returns 0 below --
 * confirm that discarding them is intentional.
 */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct cfdata *cf;
	struct raid_softc *rs;
	int     error = 0;
	int     part;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(raidPtrs[unit],
						 RF_FINAL_COMPONENT_UPDATE);
		if (doing_shutdown) {
			/* last one, and we're going down, so
			   lights out for this RAID set too. */
			error = rf_Shutdown(raidPtrs[unit]);

			/* It's no longer initialized... */
			rs->sc_flags &= ~RAIDF_INITED;

			/* detach the device */

			cf = device_cfdata(rs->sc_dev);
			error = config_detach(rs->sc_dev, DETACH_QUIET);
			free(cf, M_RAIDFRAME);

			/* Detach the disk. */
			pseudo_disk_detach(&rs->sc_dkdev);
		}
	}

	raidunlock(rs);
	return (0);

}
823
824 void
825 raidstrategy(struct buf *bp)
826 {
827 int s;
828
829 unsigned int raidID = raidunit(bp->b_dev);
830 RF_Raid_t *raidPtr;
831 struct raid_softc *rs = &raid_softc[raidID];
832 int wlabel;
833
834 if ((rs->sc_flags & RAIDF_INITED) ==0) {
835 bp->b_error = ENXIO;
836 bp->b_flags |= B_ERROR;
837 goto done;
838 }
839 if (raidID >= numraid || !raidPtrs[raidID]) {
840 bp->b_error = ENODEV;
841 bp->b_flags |= B_ERROR;
842 goto done;
843 }
844 raidPtr = raidPtrs[raidID];
845 if (!raidPtr->valid) {
846 bp->b_error = ENODEV;
847 bp->b_flags |= B_ERROR;
848 goto done;
849 }
850 if (bp->b_bcount == 0) {
851 db1_printf(("b_bcount is zero..\n"));
852 goto done;
853 }
854
855 /*
856 * Do bounds checking and adjust transfer. If there's an
857 * error, the bounds check will flag that for us.
858 */
859
860 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
861 if (DISKPART(bp->b_dev) == RAW_PART) {
862 uint64_t size; /* device size in DEV_BSIZE unit */
863
864 if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
865 size = raidPtr->totalSectors <<
866 (raidPtr->logBytesPerSector - DEV_BSHIFT);
867 } else {
868 size = raidPtr->totalSectors >>
869 (DEV_BSHIFT - raidPtr->logBytesPerSector);
870 }
871 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
872 goto done;
873 }
874 } else {
875 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
876 db1_printf(("Bounds check failed!!:%d %d\n",
877 (int) bp->b_blkno, (int) wlabel));
878 goto done;
879 }
880 }
881 s = splbio();
882
883 bp->b_resid = 0;
884
885 /* stuff it onto our queue */
886 BUFQ_PUT(rs->buf_queue, bp);
887
888 /* scheduled the IO to happen at the next convenient time */
889 wakeup(&(raidPtrs[raidID]->iodone));
890
891 splx(s);
892 return;
893
894 done:
895 bp->b_resid = bp->b_bcount;
896 biodone(bp);
897 }
898 /* ARGSUSED */
899 int
900 raidread(dev_t dev, struct uio *uio, int flags)
901 {
902 int unit = raidunit(dev);
903 struct raid_softc *rs;
904
905 if (unit >= numraid)
906 return (ENXIO);
907 rs = &raid_softc[unit];
908
909 if ((rs->sc_flags & RAIDF_INITED) == 0)
910 return (ENXIO);
911
912 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
913
914 }
915 /* ARGSUSED */
916 int
917 raidwrite(dev_t dev, struct uio *uio, int flags)
918 {
919 int unit = raidunit(dev);
920 struct raid_softc *rs;
921
922 if (unit >= numraid)
923 return (ENXIO);
924 rs = &raid_softc[unit];
925
926 if ((rs->sc_flags & RAIDF_INITED) == 0)
927 return (ENXIO);
928
929 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
930
931 }
932
933 int
934 raidioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct lwp *l)
935 {
936 int unit = raidunit(dev);
937 int error = 0;
938 int part, pmask;
939 struct cfdata *cf;
940 struct raid_softc *rs;
941 RF_Config_t *k_cfg, *u_cfg;
942 RF_Raid_t *raidPtr;
943 RF_RaidDisk_t *diskPtr;
944 RF_AccTotals_t *totals;
945 RF_DeviceConfig_t *d_cfg, **ucfgp;
946 u_char *specific_buf;
947 int retcode = 0;
948 int column;
949 int raidid;
950 struct rf_recon_req *rrcopy, *rr;
951 RF_ComponentLabel_t *clabel;
952 RF_ComponentLabel_t *ci_label;
953 RF_ComponentLabel_t **clabel_ptr;
954 RF_SingleComponent_t *sparePtr,*componentPtr;
955 RF_SingleComponent_t component;
956 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
957 int i, j, d;
958 #ifdef __HAVE_OLD_DISKLABEL
959 struct disklabel newlabel;
960 #endif
961 struct dkwedge_info *dkw;
962
963 if (unit >= numraid)
964 return (ENXIO);
965 rs = &raid_softc[unit];
966 raidPtr = raidPtrs[unit];
967
968 db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
969 (int) DISKPART(dev), (int) unit, (int) cmd));
970
971 /* Must be open for writes for these commands... */
972 switch (cmd) {
973 #ifdef DIOCGSECTORSIZE
974 case DIOCGSECTORSIZE:
975 *(u_int *)data = raidPtr->bytesPerSector;
976 return 0;
977 case DIOCGMEDIASIZE:
978 *(off_t *)data =
979 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
980 return 0;
981 #endif
982 case DIOCSDINFO:
983 case DIOCWDINFO:
984 #ifdef __HAVE_OLD_DISKLABEL
985 case ODIOCWDINFO:
986 case ODIOCSDINFO:
987 #endif
988 case DIOCWLABEL:
989 case DIOCAWEDGE:
990 case DIOCDWEDGE:
991 if ((flag & FWRITE) == 0)
992 return (EBADF);
993 }
994
995 /* Must be initialized for these... */
996 switch (cmd) {
997 case DIOCGDINFO:
998 case DIOCSDINFO:
999 case DIOCWDINFO:
1000 #ifdef __HAVE_OLD_DISKLABEL
1001 case ODIOCGDINFO:
1002 case ODIOCWDINFO:
1003 case ODIOCSDINFO:
1004 case ODIOCGDEFLABEL:
1005 #endif
1006 case DIOCGPART:
1007 case DIOCWLABEL:
1008 case DIOCGDEFLABEL:
1009 case DIOCAWEDGE:
1010 case DIOCDWEDGE:
1011 case DIOCLWEDGES:
1012 case RAIDFRAME_SHUTDOWN:
1013 case RAIDFRAME_REWRITEPARITY:
1014 case RAIDFRAME_GET_INFO:
1015 case RAIDFRAME_RESET_ACCTOTALS:
1016 case RAIDFRAME_GET_ACCTOTALS:
1017 case RAIDFRAME_KEEP_ACCTOTALS:
1018 case RAIDFRAME_GET_SIZE:
1019 case RAIDFRAME_FAIL_DISK:
1020 case RAIDFRAME_COPYBACK:
1021 case RAIDFRAME_CHECK_RECON_STATUS:
1022 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1023 case RAIDFRAME_GET_COMPONENT_LABEL:
1024 case RAIDFRAME_SET_COMPONENT_LABEL:
1025 case RAIDFRAME_ADD_HOT_SPARE:
1026 case RAIDFRAME_REMOVE_HOT_SPARE:
1027 case RAIDFRAME_INIT_LABELS:
1028 case RAIDFRAME_REBUILD_IN_PLACE:
1029 case RAIDFRAME_CHECK_PARITY:
1030 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1031 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1032 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1033 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1034 case RAIDFRAME_SET_AUTOCONFIG:
1035 case RAIDFRAME_SET_ROOT:
1036 case RAIDFRAME_DELETE_COMPONENT:
1037 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1038 if ((rs->sc_flags & RAIDF_INITED) == 0)
1039 return (ENXIO);
1040 }
1041
1042 switch (cmd) {
1043
1044 /* configure the system */
1045 case RAIDFRAME_CONFIGURE:
1046
1047 if (raidPtr->valid) {
1048 /* There is a valid RAID set running on this unit! */
1049 printf("raid%d: Device already configured!\n",unit);
1050 return(EINVAL);
1051 }
1052
1053 /* copy-in the configuration information */
1054 /* data points to a pointer to the configuration structure */
1055
1056 u_cfg = *((RF_Config_t **) data);
1057 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1058 if (k_cfg == NULL) {
1059 return (ENOMEM);
1060 }
1061 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1062 if (retcode) {
1063 RF_Free(k_cfg, sizeof(RF_Config_t));
1064 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1065 retcode));
1066 return (retcode);
1067 }
1068 /* allocate a buffer for the layout-specific data, and copy it
1069 * in */
1070 if (k_cfg->layoutSpecificSize) {
1071 if (k_cfg->layoutSpecificSize > 10000) {
1072 /* sanity check */
1073 RF_Free(k_cfg, sizeof(RF_Config_t));
1074 return (EINVAL);
1075 }
1076 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1077 (u_char *));
1078 if (specific_buf == NULL) {
1079 RF_Free(k_cfg, sizeof(RF_Config_t));
1080 return (ENOMEM);
1081 }
1082 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1083 k_cfg->layoutSpecificSize);
1084 if (retcode) {
1085 RF_Free(k_cfg, sizeof(RF_Config_t));
1086 RF_Free(specific_buf,
1087 k_cfg->layoutSpecificSize);
1088 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1089 retcode));
1090 return (retcode);
1091 }
1092 } else
1093 specific_buf = NULL;
1094 k_cfg->layoutSpecific = specific_buf;
1095
1096 /* should do some kind of sanity check on the configuration.
1097 * Store the sum of all the bytes in the last byte? */
1098
1099 /* configure the system */
1100
1101 /*
1102 * Clear the entire RAID descriptor, just to make sure
1103 * there is no stale data left in the case of a
1104 * reconfiguration
1105 */
1106 memset((char *) raidPtr, 0, sizeof(RF_Raid_t));
1107 raidPtr->raidid = unit;
1108
1109 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1110
1111 if (retcode == 0) {
1112
1113 /* allow this many simultaneous IO's to
1114 this RAID device */
1115 raidPtr->openings = RAIDOUTSTANDING;
1116
1117 raidinit(raidPtr);
1118 rf_markalldirty(raidPtr);
1119 }
1120 /* free the buffers. No return code here. */
1121 if (k_cfg->layoutSpecificSize) {
1122 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1123 }
1124 RF_Free(k_cfg, sizeof(RF_Config_t));
1125
1126 return (retcode);
1127
1128 /* shutdown the system */
1129 case RAIDFRAME_SHUTDOWN:
1130
1131 if ((error = raidlock(rs)) != 0)
1132 return (error);
1133
1134 /*
1135 * If somebody has a partition mounted, we shouldn't
1136 * shutdown.
1137 */
1138
1139 part = DISKPART(dev);
1140 pmask = (1 << part);
1141 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1142 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1143 (rs->sc_dkdev.dk_copenmask & pmask))) {
1144 raidunlock(rs);
1145 return (EBUSY);
1146 }
1147
1148 retcode = rf_Shutdown(raidPtr);
1149
1150 /* It's no longer initialized... */
1151 rs->sc_flags &= ~RAIDF_INITED;
1152
1153 /* free the pseudo device attach bits */
1154
1155 cf = device_cfdata(rs->sc_dev);
1156 /* XXX this causes us to not return any errors
1157 from the above call to rf_Shutdown() */
1158 retcode = config_detach(rs->sc_dev, DETACH_QUIET);
1159 free(cf, M_RAIDFRAME);
1160
1161 /* Detach the disk. */
1162 pseudo_disk_detach(&rs->sc_dkdev);
1163
1164 raidunlock(rs);
1165
1166 return (retcode);
1167 case RAIDFRAME_GET_COMPONENT_LABEL:
1168 clabel_ptr = (RF_ComponentLabel_t **) data;
1169 /* need to read the component label for the disk indicated
1170 by row,column in clabel */
1171
1172 	        /* For practice, let's get it directly from disk, rather
1173 than from the in-core copy */
1174 RF_Malloc( clabel, sizeof( RF_ComponentLabel_t ),
1175 (RF_ComponentLabel_t *));
1176 if (clabel == NULL)
1177 return (ENOMEM);
1178
1179 retcode = copyin( *clabel_ptr, clabel,
1180 sizeof(RF_ComponentLabel_t));
1181
1182 if (retcode) {
1183 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
1184 return(retcode);
1185 }
1186
1187 clabel->row = 0; /* Don't allow looking at anything else.*/
1188
1189 column = clabel->column;
1190
1191 if ((column < 0) || (column >= raidPtr->numCol +
1192 raidPtr->numSpare)) {
1193 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
1194 return(EINVAL);
1195 }
1196
1197 retcode = raidread_component_label(raidPtr->Disks[column].dev,
1198 raidPtr->raid_cinfo[column].ci_vp,
1199 clabel );
1200
1201 if (retcode == 0) {
1202 retcode = copyout(clabel, *clabel_ptr,
1203 sizeof(RF_ComponentLabel_t));
1204 }
1205 RF_Free(clabel, sizeof(RF_ComponentLabel_t));
1206 return (retcode);
1207
1208 case RAIDFRAME_SET_COMPONENT_LABEL:
1209 clabel = (RF_ComponentLabel_t *) data;
1210
1211 /* XXX check the label for valid stuff... */
1212 /* Note that some things *should not* get modified --
1213 the user should be re-initing the labels instead of
1214 trying to patch things.
1215 */
1216
1217 raidid = raidPtr->raidid;
1218 #ifdef DEBUG
1219 printf("raid%d: Got component label:\n", raidid);
1220 printf("raid%d: Version: %d\n", raidid, clabel->version);
1221 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1222 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1223 printf("raid%d: Column: %d\n", raidid, clabel->column);
1224 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1225 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1226 printf("raid%d: Status: %d\n", raidid, clabel->status);
1227 #endif
1228 clabel->row = 0;
1229 column = clabel->column;
1230
1231 if ((column < 0) || (column >= raidPtr->numCol)) {
1232 return(EINVAL);
1233 }
1234
1235 /* XXX this isn't allowed to do anything for now :-) */
1236
1237 /* XXX and before it is, we need to fill in the rest
1238 of the fields!?!?!?! */
1239 #if 0
1240 raidwrite_component_label(
1241 raidPtr->Disks[column].dev,
1242 raidPtr->raid_cinfo[column].ci_vp,
1243 clabel );
1244 #endif
1245 return (0);
1246
1247 case RAIDFRAME_INIT_LABELS:
1248 clabel = (RF_ComponentLabel_t *) data;
1249 /*
1250 we only want the serial number from
1251 the above. We get all the rest of the information
1252 from the config that was used to create this RAID
1253 set.
1254 */
1255
1256 raidPtr->serial_number = clabel->serial_number;
1257
1258 RF_Malloc(ci_label, sizeof(RF_ComponentLabel_t),
1259 (RF_ComponentLabel_t *));
1260 if (ci_label == NULL)
1261 return (ENOMEM);
1262
1263 raid_init_component_label(raidPtr, ci_label);
1264 ci_label->serial_number = clabel->serial_number;
1265 ci_label->row = 0; /* we dont' pretend to support more */
1266
1267 for(column=0;column<raidPtr->numCol;column++) {
1268 diskPtr = &raidPtr->Disks[column];
1269 if (!RF_DEAD_DISK(diskPtr->status)) {
1270 ci_label->partitionSize = diskPtr->partitionSize;
1271 ci_label->column = column;
1272 raidwrite_component_label(
1273 raidPtr->Disks[column].dev,
1274 raidPtr->raid_cinfo[column].ci_vp,
1275 ci_label );
1276 }
1277 }
1278 RF_Free(ci_label, sizeof(RF_ComponentLabel_t));
1279
1280 return (retcode);
1281 case RAIDFRAME_SET_AUTOCONFIG:
1282 d = rf_set_autoconfig(raidPtr, *(int *) data);
1283 printf("raid%d: New autoconfig value is: %d\n",
1284 raidPtr->raidid, d);
1285 *(int *) data = d;
1286 return (retcode);
1287
1288 case RAIDFRAME_SET_ROOT:
1289 d = rf_set_rootpartition(raidPtr, *(int *) data);
1290 printf("raid%d: New rootpartition value is: %d\n",
1291 raidPtr->raidid, d);
1292 *(int *) data = d;
1293 return (retcode);
1294
1295 /* initialize all parity */
1296 case RAIDFRAME_REWRITEPARITY:
1297
1298 if (raidPtr->Layout.map->faultsTolerated == 0) {
1299 /* Parity for RAID 0 is trivially correct */
1300 raidPtr->parity_good = RF_RAID_CLEAN;
1301 return(0);
1302 }
1303
1304 if (raidPtr->parity_rewrite_in_progress == 1) {
1305 /* Re-write is already in progress! */
1306 return(EINVAL);
1307 }
1308
1309 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1310 rf_RewriteParityThread,
1311 raidPtr,"raid_parity");
1312 return (retcode);
1313
1314
1315 case RAIDFRAME_ADD_HOT_SPARE:
1316 sparePtr = (RF_SingleComponent_t *) data;
1317 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1318 retcode = rf_add_hot_spare(raidPtr, &component);
1319 return(retcode);
1320
1321 case RAIDFRAME_REMOVE_HOT_SPARE:
1322 return(retcode);
1323
1324 case RAIDFRAME_DELETE_COMPONENT:
1325 componentPtr = (RF_SingleComponent_t *)data;
1326 memcpy( &component, componentPtr,
1327 sizeof(RF_SingleComponent_t));
1328 retcode = rf_delete_component(raidPtr, &component);
1329 return(retcode);
1330
1331 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1332 componentPtr = (RF_SingleComponent_t *)data;
1333 memcpy( &component, componentPtr,
1334 sizeof(RF_SingleComponent_t));
1335 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1336 return(retcode);
1337
1338 case RAIDFRAME_REBUILD_IN_PLACE:
1339
1340 if (raidPtr->Layout.map->faultsTolerated == 0) {
1341 /* Can't do this on a RAID 0!! */
1342 return(EINVAL);
1343 }
1344
1345 if (raidPtr->recon_in_progress == 1) {
1346 /* a reconstruct is already in progress! */
1347 return(EINVAL);
1348 }
1349
1350 componentPtr = (RF_SingleComponent_t *) data;
1351 memcpy( &component, componentPtr,
1352 sizeof(RF_SingleComponent_t));
1353 component.row = 0; /* we don't support any more */
1354 column = component.column;
1355
1356 if ((column < 0) || (column >= raidPtr->numCol)) {
1357 return(EINVAL);
1358 }
1359
1360 RF_LOCK_MUTEX(raidPtr->mutex);
1361 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1362 (raidPtr->numFailures > 0)) {
1363 /* XXX 0 above shouldn't be constant!!! */
1364 /* some component other than this has failed.
1365 Let's not make things worse than they already
1366 are... */
1367 printf("raid%d: Unable to reconstruct to disk at:\n",
1368 raidPtr->raidid);
1369 printf("raid%d: Col: %d Too many failures.\n",
1370 raidPtr->raidid, column);
1371 RF_UNLOCK_MUTEX(raidPtr->mutex);
1372 return (EINVAL);
1373 }
1374 if (raidPtr->Disks[column].status ==
1375 rf_ds_reconstructing) {
1376 printf("raid%d: Unable to reconstruct to disk at:\n",
1377 raidPtr->raidid);
1378 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column);
1379
1380 RF_UNLOCK_MUTEX(raidPtr->mutex);
1381 return (EINVAL);
1382 }
1383 if (raidPtr->Disks[column].status == rf_ds_spared) {
1384 RF_UNLOCK_MUTEX(raidPtr->mutex);
1385 return (EINVAL);
1386 }
1387 RF_UNLOCK_MUTEX(raidPtr->mutex);
1388
1389 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1390 if (rrcopy == NULL)
1391 return(ENOMEM);
1392
1393 rrcopy->raidPtr = (void *) raidPtr;
1394 rrcopy->col = column;
1395
1396 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1397 rf_ReconstructInPlaceThread,
1398 rrcopy,"raid_reconip");
1399 return(retcode);
1400
1401 case RAIDFRAME_GET_INFO:
1402 if (!raidPtr->valid)
1403 return (ENODEV);
1404 ucfgp = (RF_DeviceConfig_t **) data;
1405 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1406 (RF_DeviceConfig_t *));
1407 if (d_cfg == NULL)
1408 return (ENOMEM);
1409 d_cfg->rows = 1; /* there is only 1 row now */
1410 d_cfg->cols = raidPtr->numCol;
1411 d_cfg->ndevs = raidPtr->numCol;
1412 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1413 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1414 return (ENOMEM);
1415 }
1416 d_cfg->nspares = raidPtr->numSpare;
1417 if (d_cfg->nspares >= RF_MAX_DISKS) {
1418 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1419 return (ENOMEM);
1420 }
1421 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1422 d = 0;
1423 for (j = 0; j < d_cfg->cols; j++) {
1424 d_cfg->devs[d] = raidPtr->Disks[j];
1425 d++;
1426 }
1427 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1428 d_cfg->spares[i] = raidPtr->Disks[j];
1429 }
1430 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1431 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1432
1433 return (retcode);
1434
1435 case RAIDFRAME_CHECK_PARITY:
1436 *(int *) data = raidPtr->parity_good;
1437 return (0);
1438
1439 case RAIDFRAME_RESET_ACCTOTALS:
1440 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1441 return (0);
1442
1443 case RAIDFRAME_GET_ACCTOTALS:
1444 totals = (RF_AccTotals_t *) data;
1445 *totals = raidPtr->acc_totals;
1446 return (0);
1447
1448 case RAIDFRAME_KEEP_ACCTOTALS:
1449 raidPtr->keep_acc_totals = *(int *)data;
1450 return (0);
1451
1452 case RAIDFRAME_GET_SIZE:
1453 *(int *) data = raidPtr->totalSectors;
1454 return (0);
1455
1456 /* fail a disk & optionally start reconstruction */
1457 case RAIDFRAME_FAIL_DISK:
1458
1459 if (raidPtr->Layout.map->faultsTolerated == 0) {
1460 /* Can't do this on a RAID 0!! */
1461 return(EINVAL);
1462 }
1463
1464 rr = (struct rf_recon_req *) data;
1465 rr->row = 0;
1466 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1467 return (EINVAL);
1468
1469
1470 RF_LOCK_MUTEX(raidPtr->mutex);
1471 if (raidPtr->status == rf_rs_reconstructing) {
1472 /* you can't fail a disk while we're reconstructing! */
1473 /* XXX wrong for RAID6 */
1474 RF_UNLOCK_MUTEX(raidPtr->mutex);
1475 return (EINVAL);
1476 }
1477 if ((raidPtr->Disks[rr->col].status ==
1478 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1479 /* some other component has failed. Let's not make
1480 things worse. XXX wrong for RAID6 */
1481 RF_UNLOCK_MUTEX(raidPtr->mutex);
1482 return (EINVAL);
1483 }
1484 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1485 /* Can't fail a spared disk! */
1486 RF_UNLOCK_MUTEX(raidPtr->mutex);
1487 return (EINVAL);
1488 }
1489 RF_UNLOCK_MUTEX(raidPtr->mutex);
1490
1491 /* make a copy of the recon request so that we don't rely on
1492 * the user's buffer */
1493 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1494 if (rrcopy == NULL)
1495 return(ENOMEM);
1496 memcpy(rrcopy, rr, sizeof(*rr));
1497 rrcopy->raidPtr = (void *) raidPtr;
1498
1499 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1500 rf_ReconThread,
1501 rrcopy,"raid_recon");
1502 return (0);
1503
1504 /* invoke a copyback operation after recon on whatever disk
1505 * needs it, if any */
1506 case RAIDFRAME_COPYBACK:
1507
1508 if (raidPtr->Layout.map->faultsTolerated == 0) {
1509 /* This makes no sense on a RAID 0!! */
1510 return(EINVAL);
1511 }
1512
1513 if (raidPtr->copyback_in_progress == 1) {
1514 /* Copyback is already in progress! */
1515 return(EINVAL);
1516 }
1517
1518 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1519 rf_CopybackThread,
1520 raidPtr,"raid_copyback");
1521 return (retcode);
1522
1523 /* return the percentage completion of reconstruction */
1524 case RAIDFRAME_CHECK_RECON_STATUS:
1525 if (raidPtr->Layout.map->faultsTolerated == 0) {
1526 /* This makes no sense on a RAID 0, so tell the
1527 user it's done. */
1528 *(int *) data = 100;
1529 return(0);
1530 }
1531 if (raidPtr->status != rf_rs_reconstructing)
1532 *(int *) data = 100;
1533 else {
1534 if (raidPtr->reconControl->numRUsTotal > 0) {
1535 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1536 } else {
1537 *(int *) data = 0;
1538 }
1539 }
1540 return (0);
1541 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1542 progressInfoPtr = (RF_ProgressInfo_t **) data;
1543 if (raidPtr->status != rf_rs_reconstructing) {
1544 progressInfo.remaining = 0;
1545 progressInfo.completed = 100;
1546 progressInfo.total = 100;
1547 } else {
1548 progressInfo.total =
1549 raidPtr->reconControl->numRUsTotal;
1550 progressInfo.completed =
1551 raidPtr->reconControl->numRUsComplete;
1552 progressInfo.remaining = progressInfo.total -
1553 progressInfo.completed;
1554 }
1555 retcode = copyout(&progressInfo, *progressInfoPtr,
1556 sizeof(RF_ProgressInfo_t));
1557 return (retcode);
1558
1559 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1560 if (raidPtr->Layout.map->faultsTolerated == 0) {
1561 /* This makes no sense on a RAID 0, so tell the
1562 user it's done. */
1563 *(int *) data = 100;
1564 return(0);
1565 }
1566 if (raidPtr->parity_rewrite_in_progress == 1) {
1567 *(int *) data = 100 *
1568 raidPtr->parity_rewrite_stripes_done /
1569 raidPtr->Layout.numStripe;
1570 } else {
1571 *(int *) data = 100;
1572 }
1573 return (0);
1574
1575 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1576 progressInfoPtr = (RF_ProgressInfo_t **) data;
1577 if (raidPtr->parity_rewrite_in_progress == 1) {
1578 progressInfo.total = raidPtr->Layout.numStripe;
1579 progressInfo.completed =
1580 raidPtr->parity_rewrite_stripes_done;
1581 progressInfo.remaining = progressInfo.total -
1582 progressInfo.completed;
1583 } else {
1584 progressInfo.remaining = 0;
1585 progressInfo.completed = 100;
1586 progressInfo.total = 100;
1587 }
1588 retcode = copyout(&progressInfo, *progressInfoPtr,
1589 sizeof(RF_ProgressInfo_t));
1590 return (retcode);
1591
1592 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1593 if (raidPtr->Layout.map->faultsTolerated == 0) {
1594 /* This makes no sense on a RAID 0 */
1595 *(int *) data = 100;
1596 return(0);
1597 }
1598 if (raidPtr->copyback_in_progress == 1) {
1599 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1600 raidPtr->Layout.numStripe;
1601 } else {
1602 *(int *) data = 100;
1603 }
1604 return (0);
1605
1606 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1607 progressInfoPtr = (RF_ProgressInfo_t **) data;
1608 if (raidPtr->copyback_in_progress == 1) {
1609 progressInfo.total = raidPtr->Layout.numStripe;
1610 progressInfo.completed =
1611 raidPtr->copyback_stripes_done;
1612 progressInfo.remaining = progressInfo.total -
1613 progressInfo.completed;
1614 } else {
1615 progressInfo.remaining = 0;
1616 progressInfo.completed = 100;
1617 progressInfo.total = 100;
1618 }
1619 retcode = copyout(&progressInfo, *progressInfoPtr,
1620 sizeof(RF_ProgressInfo_t));
1621 return (retcode);
1622
1623 /* the sparetable daemon calls this to wait for the kernel to
1624 * need a spare table. this ioctl does not return until a
1625 * spare table is needed. XXX -- calling mpsleep here in the
1626 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1627 * -- I should either compute the spare table in the kernel,
1628 * or have a different -- XXX XXX -- interface (a different
1629 * character device) for delivering the table -- XXX */
1630 #if 0
1631 case RAIDFRAME_SPARET_WAIT:
1632 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1633 while (!rf_sparet_wait_queue)
1634 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1635 waitreq = rf_sparet_wait_queue;
1636 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1637 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1638
1639 /* structure assignment */
1640 *((RF_SparetWait_t *) data) = *waitreq;
1641
1642 RF_Free(waitreq, sizeof(*waitreq));
1643 return (0);
1644
1645 /* wakes up a process waiting on SPARET_WAIT and puts an error
1646 	 * code in it that will cause the daemon to exit */
1647 case RAIDFRAME_ABORT_SPARET_WAIT:
1648 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1649 waitreq->fcol = -1;
1650 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1651 waitreq->next = rf_sparet_wait_queue;
1652 rf_sparet_wait_queue = waitreq;
1653 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1654 wakeup(&rf_sparet_wait_queue);
1655 return (0);
1656
1657 /* used by the spare table daemon to deliver a spare table
1658 * into the kernel */
1659 case RAIDFRAME_SEND_SPARET:
1660
1661 /* install the spare table */
1662 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1663
1664 /* respond to the requestor. the return status of the spare
1665 * table installation is passed in the "fcol" field */
1666 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1667 waitreq->fcol = retcode;
1668 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1669 waitreq->next = rf_sparet_resp_queue;
1670 rf_sparet_resp_queue = waitreq;
1671 wakeup(&rf_sparet_resp_queue);
1672 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1673
1674 return (retcode);
1675 #endif
1676
1677 default:
1678 break; /* fall through to the os-specific code below */
1679
1680 }
1681
1682 if (!raidPtr->valid)
1683 return (EINVAL);
1684
1685 /*
1686 * Add support for "regular" device ioctls here.
1687 */
1688
1689 switch (cmd) {
1690 case DIOCGDINFO:
1691 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1692 break;
1693 #ifdef __HAVE_OLD_DISKLABEL
1694 case ODIOCGDINFO:
1695 newlabel = *(rs->sc_dkdev.dk_label);
1696 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1697 return ENOTTY;
1698 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1699 break;
1700 #endif
1701
1702 case DIOCGPART:
1703 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1704 ((struct partinfo *) data)->part =
1705 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1706 break;
1707
1708 case DIOCWDINFO:
1709 case DIOCSDINFO:
1710 #ifdef __HAVE_OLD_DISKLABEL
1711 case ODIOCWDINFO:
1712 case ODIOCSDINFO:
1713 #endif
1714 {
1715 struct disklabel *lp;
1716 #ifdef __HAVE_OLD_DISKLABEL
1717 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1718 memset(&newlabel, 0, sizeof newlabel);
1719 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1720 lp = &newlabel;
1721 } else
1722 #endif
1723 lp = (struct disklabel *)data;
1724
1725 if ((error = raidlock(rs)) != 0)
1726 return (error);
1727
1728 rs->sc_flags |= RAIDF_LABELLING;
1729
1730 error = setdisklabel(rs->sc_dkdev.dk_label,
1731 lp, 0, rs->sc_dkdev.dk_cpulabel);
1732 if (error == 0) {
1733 if (cmd == DIOCWDINFO
1734 #ifdef __HAVE_OLD_DISKLABEL
1735 || cmd == ODIOCWDINFO
1736 #endif
1737 )
1738 error = writedisklabel(RAIDLABELDEV(dev),
1739 raidstrategy, rs->sc_dkdev.dk_label,
1740 rs->sc_dkdev.dk_cpulabel);
1741 }
1742 rs->sc_flags &= ~RAIDF_LABELLING;
1743
1744 raidunlock(rs);
1745
1746 if (error)
1747 return (error);
1748 break;
1749 }
1750
1751 case DIOCWLABEL:
1752 if (*(int *) data != 0)
1753 rs->sc_flags |= RAIDF_WLABEL;
1754 else
1755 rs->sc_flags &= ~RAIDF_WLABEL;
1756 break;
1757
1758 case DIOCGDEFLABEL:
1759 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1760 break;
1761
1762 #ifdef __HAVE_OLD_DISKLABEL
1763 case ODIOCGDEFLABEL:
1764 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1765 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1766 return ENOTTY;
1767 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1768 break;
1769 #endif
1770
1771 case DIOCAWEDGE:
1772 case DIOCDWEDGE:
1773 dkw = (void *)data;
1774
1775 /* If the ioctl happens here, the parent is us. */
1776 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1777 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1778
1779 case DIOCLWEDGES:
1780 return dkwedge_list(&rs->sc_dkdev,
1781 (struct dkwedge_list *)data, l);
1782
1783 default:
1784 retcode = ENOTTY;
1785 }
1786 return (retcode);
1787
1788 }
1789
1790
1791 /* raidinit -- complete the rest of the initialization for the
1792 RAIDframe device. */
1793
1794
1795 static void
1796 raidinit(RF_Raid_t *raidPtr)
1797 {
1798 struct cfdata *cf;
1799 struct raid_softc *rs;
1800 int unit;
1801
1802 unit = raidPtr->raidid;
1803
1804 rs = &raid_softc[unit];
1805
1806 /* XXX should check return code first... */
1807 rs->sc_flags |= RAIDF_INITED;
1808
1809 /* XXX doesn't check bounds. */
1810 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1811
1812 rs->sc_dkdev.dk_name = rs->sc_xname;
1813
1814 /* attach the pseudo device */
1815 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1816 cf->cf_name = raid_cd.cd_name;
1817 cf->cf_atname = raid_cd.cd_name;
1818 cf->cf_unit = unit;
1819 cf->cf_fstate = FSTATE_STAR;
1820
1821 rs->sc_dev = config_attach_pseudo(cf);
1822
1823 if (rs->sc_dev==NULL) {
1824 printf("raid%d: config_attach_pseudo failed\n",
1825 raidPtr->raidid);
1826 }
1827
1828 /* disk_attach actually creates space for the CPU disklabel, among
1829 * other things, so it's critical to call this *BEFORE* we try putzing
1830 * with disklabels. */
1831
1832 disk_attach(&rs->sc_dkdev);
1833
1834 /* XXX There may be a weird interaction here between this, and
1835 * protectedSectors, as used in RAIDframe. */
1836
1837 rs->sc_size = raidPtr->totalSectors;
1838
1839 dkwedge_discover(&rs->sc_dkdev);
1840
1841 rf_set_properties(rs, raidPtr);
1842
1843 }
1844 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1845 /* wake up the daemon & tell it to get us a spare table
1846 * XXX
1847 * the entries in the queues should be tagged with the raidPtr
1848 * so that in the extremely rare case that two recons happen at once,
1849  * we know for which device we're requesting a spare table
1850 * XXX
1851 *
1852 * XXX This code is not currently used. GO
1853 */
1854 int
1855 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1856 {
1857 int retcode;
1858
1859 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1860 req->next = rf_sparet_wait_queue;
1861 rf_sparet_wait_queue = req;
1862 wakeup(&rf_sparet_wait_queue);
1863
1864 /* mpsleep unlocks the mutex */
1865 while (!rf_sparet_resp_queue) {
1866 tsleep(&rf_sparet_resp_queue, PRIBIO,
1867 "raidframe getsparetable", 0);
1868 }
1869 req = rf_sparet_resp_queue;
1870 rf_sparet_resp_queue = req->next;
1871 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1872
1873 retcode = req->fcol;
1874 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1875 * alloc'd */
1876 return (retcode);
1877 }
1878 #endif
1879
1880 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1881 * bp & passes it down.
1882 * any calls originating in the kernel must use non-blocking I/O
1883 * do some extra sanity checking to return "appropriate" error values for
1884 * certain conditions (to make some standard utilities work)
1885 *
1886 * Formerly known as: rf_DoAccessKernel
1887 */
1888 void
1889 raidstart(RF_Raid_t *raidPtr)
1890 {
1891 RF_SectorCount_t num_blocks, pb, sum;
1892 RF_RaidAddr_t raid_addr;
1893 struct partition *pp;
1894 daddr_t blocknum;
1895 int unit;
1896 struct raid_softc *rs;
1897 int do_async;
1898 struct buf *bp;
1899 int rc;
1900
1901 unit = raidPtr->raidid;
1902 rs = &raid_softc[unit];
1903
1904 /* quick check to see if anything has died recently */
1905 RF_LOCK_MUTEX(raidPtr->mutex);
1906 if (raidPtr->numNewFailures > 0) {
1907 RF_UNLOCK_MUTEX(raidPtr->mutex);
1908 rf_update_component_labels(raidPtr,
1909 RF_NORMAL_COMPONENT_UPDATE);
1910 RF_LOCK_MUTEX(raidPtr->mutex);
1911 raidPtr->numNewFailures--;
1912 }
1913
1914 /* Check to see if we're at the limit... */
1915 while (raidPtr->openings > 0) {
1916 RF_UNLOCK_MUTEX(raidPtr->mutex);
1917
1918 /* get the next item, if any, from the queue */
1919 if ((bp = BUFQ_GET(rs->buf_queue)) == NULL) {
1920 /* nothing more to do */
1921 return;
1922 }
1923
1924 /* Ok, for the bp we have here, bp->b_blkno is relative to the
1925 * partition.. Need to make it absolute to the underlying
1926 * device.. */
1927
1928 blocknum = bp->b_blkno;
1929 if (DISKPART(bp->b_dev) != RAW_PART) {
1930 pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
1931 blocknum += pp->p_offset;
1932 }
1933
1934 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1935 (int) blocknum));
1936
1937 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1938 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1939
1940 /* *THIS* is where we adjust what block we're going to...
1941 * but DO NOT TOUCH bp->b_blkno!!! */
1942 raid_addr = blocknum;
1943
1944 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1945 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1946 sum = raid_addr + num_blocks + pb;
1947 if (1 || rf_debugKernelAccess) {
1948 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1949 (int) raid_addr, (int) sum, (int) num_blocks,
1950 (int) pb, (int) bp->b_resid));
1951 }
1952 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1953 || (sum < num_blocks) || (sum < pb)) {
1954 bp->b_error = ENOSPC;
1955 bp->b_flags |= B_ERROR;
1956 bp->b_resid = bp->b_bcount;
1957 biodone(bp);
1958 RF_LOCK_MUTEX(raidPtr->mutex);
1959 continue;
1960 }
1961 /*
1962 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1963 */
1964
1965 if (bp->b_bcount & raidPtr->sectorMask) {
1966 bp->b_error = EINVAL;
1967 bp->b_flags |= B_ERROR;
1968 bp->b_resid = bp->b_bcount;
1969 biodone(bp);
1970 RF_LOCK_MUTEX(raidPtr->mutex);
1971 continue;
1972
1973 }
1974 db1_printf(("Calling DoAccess..\n"));
1975
1976
1977 RF_LOCK_MUTEX(raidPtr->mutex);
1978 raidPtr->openings--;
1979 RF_UNLOCK_MUTEX(raidPtr->mutex);
1980
1981 /*
1982 * Everything is async.
1983 */
1984 do_async = 1;
1985
1986 disk_busy(&rs->sc_dkdev);
1987
1988 /* XXX we're still at splbio() here... do we *really*
1989 need to be? */
1990
1991 /* don't ever condition on bp->b_flags & B_WRITE.
1992 * always condition on B_READ instead */
1993
1994 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
1995 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
1996 do_async, raid_addr, num_blocks,
1997 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
1998
1999 if (rc) {
2000 bp->b_error = rc;
2001 bp->b_flags |= B_ERROR;
2002 bp->b_resid = bp->b_bcount;
2003 biodone(bp);
2004 /* continue loop */
2005 }
2006
2007 RF_LOCK_MUTEX(raidPtr->mutex);
2008 }
2009 RF_UNLOCK_MUTEX(raidPtr->mutex);
2010 }
2011
2012
2013
2014
2015 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2016
2017 int
2018 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2019 {
2020 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2021 struct buf *bp;
2022
2023 req->queue = queue;
2024
2025 #if DIAGNOSTIC
2026 if (queue->raidPtr->raidid >= numraid) {
2027 printf("Invalid unit number: %d %d\n", queue->raidPtr->raidid,
2028 numraid);
2029 panic("Invalid Unit number in rf_DispatchKernelIO");
2030 }
2031 #endif
2032
2033 bp = req->bp;
2034
2035 switch (req->type) {
2036 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
2037 /* XXX need to do something extra here.. */
2038 /* I'm leaving this in, as I've never actually seen it used,
2039 * and I'd like folks to report it... GO */
2040 printf(("WAKEUP CALLED\n"));
2041 queue->numOutstanding++;
2042
2043 bp->b_flags = 0;
2044 bp->b_private = req;
2045
2046 KernelWakeupFunc(bp);
2047 break;
2048
2049 case RF_IO_TYPE_READ:
2050 case RF_IO_TYPE_WRITE:
2051 #if RF_ACC_TRACE > 0
2052 if (req->tracerec) {
2053 RF_ETIMER_START(req->tracerec->timer);
2054 }
2055 #endif
2056 InitBP(bp, queue->rf_cinfo->ci_vp,
2057 op, queue->rf_cinfo->ci_dev,
2058 req->sectorOffset, req->numSector,
2059 req->buf, KernelWakeupFunc, (void *) req,
2060 queue->raidPtr->logBytesPerSector, req->b_proc);
2061
2062 if (rf_debugKernelAccess) {
2063 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2064 (long) bp->b_blkno));
2065 }
2066 queue->numOutstanding++;
2067 queue->last_deq_sector = req->sectorOffset;
2068 /* acc wouldn't have been let in if there were any pending
2069 * reqs at any other priority */
2070 queue->curPriority = req->priority;
2071
2072 db1_printf(("Going for %c to unit %d col %d\n",
2073 req->type, queue->raidPtr->raidid,
2074 queue->col));
2075 db1_printf(("sector %d count %d (%d bytes) %d\n",
2076 (int) req->sectorOffset, (int) req->numSector,
2077 (int) (req->numSector <<
2078 queue->raidPtr->logBytesPerSector),
2079 (int) queue->raidPtr->logBytesPerSector));
2080 VOP_STRATEGY(bp->b_vp, bp);
2081
2082 break;
2083
2084 default:
2085 panic("bad req->type in rf_DispatchKernelIO");
2086 }
2087 db1_printf(("Exiting from DispatchKernelIO\n"));
2088
2089 return (0);
2090 }
2091 /* this is the callback function associated with an I/O invoked from
2092 kernel code.
2093 */
/*
 * KernelWakeupFunc -- iodone callback for component I/O dispatched
 * from rf_DispatchKernelIO (installed via bp->b_iodone in InitBP).
 *
 * Runs with interrupts blocked at splbio().  Recovers the
 * RF_DiskQueueData_t stashed in bp->b_private, accounts trace timing
 * (if RF_ACC_TRACE), marks the component failed on a B_ERROR
 * completion when that would not exceed the set's fault tolerance,
 * records the error in req->error, and queues the request on the
 * raidPtr->iodone list, waking the raidio thread to process it.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;
	int s;

	s = splbio();
	db1_printf(("recovering the request queue:\n"));
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		RF_LOCK_MUTEX(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		RF_UNLOCK_MUTEX(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If B_ERROR is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_flags & B_ERROR) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error. Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */

	req->error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;

	simple_lock(&queue->raidPtr->iodone_lock);

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	wakeup(&(queue->raidPtr->iodone));

	simple_unlock(&queue->raidPtr->iodone_lock);

	splx(s);
}
2163
2164
2165
2166 /*
2167 * initialize a buf structure for doing an I/O in the kernel.
2168 */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
    RF_SectorNum_t startSect, RF_SectorCount_t numSect, caddr_t bf,
    void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
    struct proc *b_proc)
{
	/* bp->b_flags = B_PHYS | rw_flag; */
	bp->b_flags = B_CALL | rw_flag;	/* XXX need B_PHYS here too??? */
	/* byte count derived from the sector count and sector shift */
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	bp->b_blkno = startSect;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	/* a zero-length transfer here indicates a caller bug */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	/* cbFunc/cbArg become the iodone callback and its argument
	   (KernelWakeupFunc recovers cbArg from b_private) */
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
	bp->b_vp = b_vp;
	/* writes in flight are accounted against the vnode */
	if ((bp->b_flags & B_READ) == 0) {
		bp->b_vp->v_numoutput++;
	}

}
2196
2197 static void
2198 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2199 struct disklabel *lp)
2200 {
2201 memset(lp, 0, sizeof(*lp));
2202
2203 /* fabricate a label... */
2204 lp->d_secperunit = raidPtr->totalSectors;
2205 lp->d_secsize = raidPtr->bytesPerSector;
2206 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2207 lp->d_ntracks = 4 * raidPtr->numCol;
2208 lp->d_ncylinders = raidPtr->totalSectors /
2209 (lp->d_nsectors * lp->d_ntracks);
2210 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2211
2212 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2213 lp->d_type = DTYPE_RAID;
2214 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2215 lp->d_rpm = 3600;
2216 lp->d_interleave = 1;
2217 lp->d_flags = 0;
2218
2219 lp->d_partitions[RAW_PART].p_offset = 0;
2220 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2221 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2222 lp->d_npartitions = RAW_PART + 1;
2223
2224 lp->d_magic = DISKMAGIC;
2225 lp->d_magic2 = DISKMAGIC;
2226 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2227
2228 }
2229 /*
2230 * Read the disklabel from the raid device. If one is not present, fake one
2231 * up.
2232 */
static void
raidgetdisklabel(dev_t dev)
{
	int unit = raidunit(dev);
	struct raid_softc *rs = &raid_softc[unit];
	const char *errstring;
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
	RF_Raid_t *raidPtr;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = raidPtrs[unit];

	/* start from a fabricated label so readdisklabel() has sane
	   geometry to work with */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		/* no label found on "disk" -- install a default one */
		raidmakedisklabel(rs);
	else {
		int i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%d) != "
			    "the size of raid (%ld)\n", unit, rs->sc_xname,
			    lp->d_secperunit, (long) rs->sc_size);
		/* warn about (but do not reject) partitions that run
		   past the end of the set */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				    "exceeds the size of raid (%ld)\n",
				    unit, rs->sc_xname, 'a' + i, (long) rs->sc_size);
		}
	}

}
2285 /*
2286 * Take care of things one might want to take care of in the event
2287 * that a disklabel isn't present.
2288 */
2289 static void
2290 raidmakedisklabel(struct raid_softc *rs)
2291 {
2292 struct disklabel *lp = rs->sc_dkdev.dk_label;
2293 db1_printf(("Making a label..\n"));
2294
2295 /*
2296 * For historical reasons, if there's no disklabel present
2297 * the raw partition must be marked FS_BSDFFS.
2298 */
2299
2300 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2301
2302 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2303
2304 lp->d_checksum = dkcksum(lp);
2305 }
2306 /*
2307 * Wait interruptibly for an exclusive lock.
2308 *
2309 * XXX
2310 * Several drivers do this; it should be abstracted and made MP-safe.
2311 * (Hmm... where have we seen this warning before :-> GO )
2312 */
2313 static int
2314 raidlock(struct raid_softc *rs)
2315 {
2316 int error;
2317
2318 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2319 rs->sc_flags |= RAIDF_WANTED;
2320 if ((error =
2321 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2322 return (error);
2323 }
2324 rs->sc_flags |= RAIDF_LOCKED;
2325 return (0);
2326 }
2327 /*
2328 * Unlock and wake up any waiters.
2329 */
2330 static void
2331 raidunlock(struct raid_softc *rs)
2332 {
2333
2334 rs->sc_flags &= ~RAIDF_LOCKED;
2335 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2336 rs->sc_flags &= ~RAIDF_WANTED;
2337 wakeup(rs);
2338 }
2339 }
2340
2341
2342 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2343 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2344
2345 int
2346 raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter)
2347 {
2348 RF_ComponentLabel_t clabel;
2349 raidread_component_label(dev, b_vp, &clabel);
2350 clabel.mod_counter = mod_counter;
2351 clabel.clean = RF_RAID_CLEAN;
2352 raidwrite_component_label(dev, b_vp, &clabel);
2353 return(0);
2354 }
2355
2356
2357 int
2358 raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter)
2359 {
2360 RF_ComponentLabel_t clabel;
2361 raidread_component_label(dev, b_vp, &clabel);
2362 clabel.mod_counter = mod_counter;
2363 clabel.clean = RF_RAID_DIRTY;
2364 raidwrite_component_label(dev, b_vp, &clabel);
2365 return(0);
2366 }
2367
2368 /* ARGSUSED */
2369 int
2370 raidread_component_label(dev_t dev, struct vnode *b_vp,
2371 RF_ComponentLabel_t *clabel)
2372 {
2373 struct buf *bp;
2374 const struct bdevsw *bdev;
2375 int error;
2376
2377 /* XXX should probably ensure that we don't try to do this if
2378 someone has changed rf_protected_sectors. */
2379
2380 if (b_vp == NULL) {
2381 /* For whatever reason, this component is not valid.
2382 Don't try to read a component label from it. */
2383 return(EINVAL);
2384 }
2385
2386 /* get a block of the appropriate size... */
2387 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2388 bp->b_dev = dev;
2389
2390 /* get our ducks in a row for the read */
2391 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2392 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2393 bp->b_flags |= B_READ;
2394 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2395
2396 bdev = bdevsw_lookup(bp->b_dev);
2397 if (bdev == NULL)
2398 return (ENXIO);
2399 (*bdev->d_strategy)(bp);
2400
2401 error = biowait(bp);
2402
2403 if (!error) {
2404 memcpy(clabel, bp->b_data,
2405 sizeof(RF_ComponentLabel_t));
2406 }
2407
2408 brelse(bp);
2409 return(error);
2410 }
2411 /* ARGSUSED */
2412 int
2413 raidwrite_component_label(dev_t dev, struct vnode *b_vp,
2414 RF_ComponentLabel_t *clabel)
2415 {
2416 struct buf *bp;
2417 const struct bdevsw *bdev;
2418 int error;
2419
2420 /* get a block of the appropriate size... */
2421 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2422 bp->b_dev = dev;
2423
2424 /* get our ducks in a row for the write */
2425 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2426 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2427 bp->b_flags |= B_WRITE;
2428 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2429
2430 memset(bp->b_data, 0, RF_COMPONENT_INFO_SIZE );
2431
2432 memcpy(bp->b_data, clabel, sizeof(RF_ComponentLabel_t));
2433
2434 bdev = bdevsw_lookup(bp->b_dev);
2435 if (bdev == NULL)
2436 return (ENXIO);
2437 (*bdev->d_strategy)(bp);
2438 error = biowait(bp);
2439 brelse(bp);
2440 if (error) {
2441 #if 1
2442 printf("Failed to write RAID component info!\n");
2443 #endif
2444 }
2445
2446 return(error);
2447 }
2448
/*
 * Bump the set's mod_counter and mark the component label of every
 * non-failed component, and of every in-use spare, dirty with that
 * new counter.  Spared components are left untouched.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			raidread_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			if (clabel.status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(
				       raidPtr->Disks[c].dev,
				       raidPtr->raid_cinfo[c].ci_vp,
				       raidPtr->mod_counter);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column references this
			   spare, scol keeps its previous value (or -1)
			   -- presumably a used spare always has a
			   matching spareCol; confirm. */

			raidread_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, &clabel);

			clabel.row = 0;
			clabel.column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr->Disks[sparecol].dev,
				      raidPtr->raid_cinfo[sparecol].ci_vp,
				      raidPtr->mod_counter);
		}
	}
}
2519
2520
/*
 * Push updated component labels to every optimal component and every
 * in-use spare: bump the set's mod_counter, record the unit number,
 * and (for spares) mark them optimal in their own labels.  When this
 * is the final update (final == RF_FINAL_COMPONENT_UPDATE) and parity
 * is known good, the labels are additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			raidread_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			/* make sure status is noted */
			clabel.status = rf_ds_optimal;

			/* bump the counter */
			clabel.mod_counter = raidPtr->mod_counter;

			/* note what unit we are configured as */
			clabel.last_unit = raidPtr->raidid;

			raidwrite_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(
					      raidPtr->Disks[c].dev,
					      raidPtr->raid_cinfo[c].ci_vp,
					      raidPtr->mod_counter);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): scol stays at its previous
			   value (or -1) if no column matches --
			   presumably a used spare always has one;
			   confirm. */

			/* XXX shouldn't *really* need this... */
			raidread_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, &clabel);

			clabel.mod_counter = raidPtr->mod_counter;
			clabel.column = scol;
			clabel.status = rf_ds_optimal;
			clabel.last_unit = raidPtr->raidid;

			raidwrite_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean( raidPtr->Disks[sparecol].dev,
						       raidPtr->raid_cinfo[sparecol].ci_vp,
						       raidPtr->mod_counter);
				}
			}
		}
	}
}
2616
2617 void
2618 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2619 {
2620 struct proc *p;
2621 struct lwp *l;
2622
2623 p = raidPtr->engine_thread;
2624 l = LIST_FIRST(&p->p_lwps);
2625
2626 if (vp != NULL) {
2627 if (auto_configured == 1) {
2628 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2629 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED, 0);
2630 vput(vp);
2631
2632 } else {
2633 (void) vn_close(vp, FREAD | FWRITE, p->p_cred, l);
2634 }
2635 }
2636 }
2637
2638
2639 void
2640 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2641 {
2642 int r,c;
2643 struct vnode *vp;
2644 int acd;
2645
2646
2647 /* We take this opportunity to close the vnodes like we should.. */
2648
2649 for (c = 0; c < raidPtr->numCol; c++) {
2650 vp = raidPtr->raid_cinfo[c].ci_vp;
2651 acd = raidPtr->Disks[c].auto_configured;
2652 rf_close_component(raidPtr, vp, acd);
2653 raidPtr->raid_cinfo[c].ci_vp = NULL;
2654 raidPtr->Disks[c].auto_configured = 0;
2655 }
2656
2657 for (r = 0; r < raidPtr->numSpare; r++) {
2658 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2659 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2660 rf_close_component(raidPtr, vp, acd);
2661 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2662 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2663 }
2664 }
2665
2666
/*
 * Kernel-thread body: fail the disk named in req->col (optionally
 * kicking off reconstruction to a spare, per RF_FDFLAGS_RECON), then
 * exit.  Takes ownership of req and frees it.  recon_in_progress
 * brackets the work so other paths can tell a recon is running.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* the request was allocated by our creator; it's ours to free */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2688
/*
 * Kernel-thread body: rewrite all parity for the set.  On success
 * parity_good is set to RF_RAID_CLEAN so the component labels get
 * the clean bit at shutdown.  Wakes anyone blocked on
 * parity_rewrite_in_progress if a shutdown is pending, then exits.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity!\n",raidPtr->raidid);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2718
2719
/*
 * Kernel-thread body: copy reconstructed data from spares back to
 * their original (replaced) components, then exit.
 * copyback_in_progress brackets the work.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2734
2735
/*
 * Kernel-thread body: reconstruct the component in req->col in place
 * (i.e. onto the same physical device), then exit.  Takes ownership
 * of req and frees it.  recon_in_progress brackets the work.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* the request was allocated by our creator; it's ours to free */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2753
/*
 * Try to read a component label from (dev, vp).  If one is found and
 * looks reasonable (and its partitionSize fits within `size'), a new
 * RF_AutoConfig_t is prepended to ac_list and the new list head is
 * returned; the RF_AutoConfig_t takes ownership of vp and the label.
 * If the label is missing or unreasonable, vp is closed and released
 * and ac_list is returned unchanged.  On allocation failure the whole
 * ac_list is torn down and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* out of memory: free every entry (and label)
		   accumulated so far and give up entirely */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel) &&
		    (clabel->partitionSize <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: the component is not joining a set, so
		   release both the label memory and the vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED, 0);
		vput(vp);
	}
	return ac_list;
}
2810
/*
 * Scan every disk-class device in the system for RAIDframe
 * components and return a list of RF_AutoConfig_t records describing
 * the ones found (NULL if none).  Floppies and CDs are skipped.  For
 * wedges ("dk" devices) the wedge itself is checked for the
 * RAIDframe partition type; for ordinary disks every partition whose
 * fstype is FS_RAID is handed to rf_get_component().
 */
RF_AutoConfig_t *
rf_find_raid_components()
{
	struct vnode *vp;
	struct disklabel label;
	struct device *dv;
	dev_t dev;
	int bmajor, bminor, wedge;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;


	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = alldevs.tqh_first; dv != NULL;
	     dv = dv->dv_list.tqe_next) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(dv->dv_xname, NULL, 0);

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD, NOCRED, 0);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		if (wedge) {
			/* wedges carry their partition type in the
			   wedge info rather than in a disklabel */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED, 0);
			/* done with the raw vnode either way */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED, 0);
			vput(vp);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", dv->dv_xname, error);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0)
				continue;

			/* NOTE(review): vp was released just above but
			   is still passed here -- presumably
			   rf_get_component re-reads via dev; confirm
			   the vnode lifetime is correct. */
			ac_list = rf_get_component(ac_list, dev, vp,
			    dv->dv_xname, dkw.dkw_size);
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED, 0);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", dv->dv_xname, error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED, 0);
		vput(vp);

		if (error)
			continue;

		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED, 0);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    dv->dv_xname, 'a' + i);
			/* rf_get_component takes ownership of vp */
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size);
		}
	}
	return ac_list;
}
2944
2945
2946 static int
2947 rf_reasonable_label(RF_ComponentLabel_t *clabel)
2948 {
2949
2950 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2951 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2952 ((clabel->clean == RF_RAID_CLEAN) ||
2953 (clabel->clean == RF_RAID_DIRTY)) &&
2954 clabel->row >=0 &&
2955 clabel->column >= 0 &&
2956 clabel->num_rows > 0 &&
2957 clabel->num_columns > 0 &&
2958 clabel->row < clabel->num_rows &&
2959 clabel->column < clabel->num_columns &&
2960 clabel->blockSize > 0 &&
2961 clabel->numBlocks > 0) {
2962 /* label looks reasonable enough... */
2963 return(1);
2964 }
2965 return(0);
2966 }
2967
2968
2969 #ifdef DEBUG
/*
 * Dump the interesting fields of a component label to the console.
 * Compiled only into DEBUG kernels (see surrounding #ifdef).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status );
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %d\n",
	       (char) clabel->parityConfig, clabel->blockSize,
	       clabel->numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No" );
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No" );
	printf("   Last configured as: raid%d\n", clabel->last_unit );
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
2995 #endif
2996
2997 RF_ConfigSet_t *
2998 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
2999 {
3000 RF_AutoConfig_t *ac;
3001 RF_ConfigSet_t *config_sets;
3002 RF_ConfigSet_t *cset;
3003 RF_AutoConfig_t *ac_next;
3004
3005
3006 config_sets = NULL;
3007
3008 /* Go through the AutoConfig list, and figure out which components
3009 belong to what sets. */
3010 ac = ac_list;
3011 while(ac!=NULL) {
3012 /* we're going to putz with ac->next, so save it here
3013 for use at the end of the loop */
3014 ac_next = ac->next;
3015
3016 if (config_sets == NULL) {
3017 /* will need at least this one... */
3018 config_sets = (RF_ConfigSet_t *)
3019 malloc(sizeof(RF_ConfigSet_t),
3020 M_RAIDFRAME, M_NOWAIT);
3021 if (config_sets == NULL) {
3022 panic("rf_create_auto_sets: No memory!");
3023 }
3024 /* this one is easy :) */
3025 config_sets->ac = ac;
3026 config_sets->next = NULL;
3027 config_sets->rootable = 0;
3028 ac->next = NULL;
3029 } else {
3030 /* which set does this component fit into? */
3031 cset = config_sets;
3032 while(cset!=NULL) {
3033 if (rf_does_it_fit(cset, ac)) {
3034 /* looks like it matches... */
3035 ac->next = cset->ac;
3036 cset->ac = ac;
3037 break;
3038 }
3039 cset = cset->next;
3040 }
3041 if (cset==NULL) {
3042 /* didn't find a match above... new set..*/
3043 cset = (RF_ConfigSet_t *)
3044 malloc(sizeof(RF_ConfigSet_t),
3045 M_RAIDFRAME, M_NOWAIT);
3046 if (cset == NULL) {
3047 panic("rf_create_auto_sets: No memory!");
3048 }
3049 cset->ac = ac;
3050 ac->next = NULL;
3051 cset->next = config_sets;
3052 cset->rootable = 0;
3053 config_sets = cset;
3054 }
3055 }
3056 ac = ac_next;
3057 }
3058
3059
3060 return(config_sets);
3061 }
3062
3063 static int
3064 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3065 {
3066 RF_ComponentLabel_t *clabel1, *clabel2;
3067
3068 /* If this one matches the *first* one in the set, that's good
3069 enough, since the other members of the set would have been
3070 through here too... */
3071 /* note that we are not checking partitionSize here..
3072
3073 Note that we are also not checking the mod_counters here.
3074 If everything else matches execpt the mod_counter, that's
3075 good enough for this test. We will deal with the mod_counters
3076 a little later in the autoconfiguration process.
3077
3078 (clabel1->mod_counter == clabel2->mod_counter) &&
3079
3080 The reason we don't check for this is that failed disks
3081 will have lower modification counts. If those disks are
3082 not added to the set they used to belong to, then they will
3083 form their own set, which may result in 2 different sets,
3084 for example, competing to be configured at raid0, and
3085 perhaps competing to be the root filesystem set. If the
3086 wrong ones get configured, or both attempt to become /,
3087 weird behaviour and or serious lossage will occur. Thus we
3088 need to bring them into the fold here, and kick them out at
3089 a later point.
3090
3091 */
3092
3093 clabel1 = cset->ac->clabel;
3094 clabel2 = ac->clabel;
3095 if ((clabel1->version == clabel2->version) &&
3096 (clabel1->serial_number == clabel2->serial_number) &&
3097 (clabel1->num_rows == clabel2->num_rows) &&
3098 (clabel1->num_columns == clabel2->num_columns) &&
3099 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3100 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3101 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3102 (clabel1->parityConfig == clabel2->parityConfig) &&
3103 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3104 (clabel1->blockSize == clabel2->blockSize) &&
3105 (clabel1->numBlocks == clabel2->numBlocks) &&
3106 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3107 (clabel1->root_partition == clabel2->root_partition) &&
3108 (clabel1->last_unit == clabel2->last_unit) &&
3109 (clabel1->config_order == clabel2->config_order)) {
3110 /* if it get's here, it almost *has* to be a match */
3111 } else {
3112 /* it's not consistent with somebody in the set..
3113 punt */
3114 return(0);
3115 }
3116 /* all was fine.. it must fit... */
3117 return(1);
3118 }
3119
/*
 * Decide whether config set cset has enough live components to be
 * configured.  The set's working mod_counter is the maximum found
 * among its components; a column with no component at that counter
 * counts as missing.  RAID 1 gets special handling: components pair
 * up as (even, odd), and only losing both halves of a pair is fatal.
 * For RAID 0 any missing component is fatal; RAID 4/5 tolerate one.
 * Returns 1 if configurable, 0 if not.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a component for column c at the current
		   mod_counter */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
                                            we're failed, and
                                            so is the even
                                            component, it's
                                            "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3222
3223 void
3224 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3225 RF_Raid_t *raidPtr)
3226 {
3227 RF_ComponentLabel_t *clabel;
3228 int i;
3229
3230 clabel = ac->clabel;
3231
3232 /* 1. Fill in the common stuff */
3233 config->numRow = clabel->num_rows = 1;
3234 config->numCol = clabel->num_columns;
3235 config->numSpare = 0; /* XXX should this be set here? */
3236 config->sectPerSU = clabel->sectPerSU;
3237 config->SUsPerPU = clabel->SUsPerPU;
3238 config->SUsPerRU = clabel->SUsPerRU;
3239 config->parityConfig = clabel->parityConfig;
3240 /* XXX... */
3241 strcpy(config->diskQueueType,"fifo");
3242 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3243 config->layoutSpecificSize = 0; /* XXX ?? */
3244
3245 while(ac!=NULL) {
3246 /* row/col values will be in range due to the checks
3247 in reasonable_label() */
3248 strcpy(config->devnames[0][ac->clabel->column],
3249 ac->devname);
3250 ac = ac->next;
3251 }
3252
3253 for(i=0;i<RF_MAXDBGV;i++) {
3254 config->debugVars[i][0] = 0;
3255 }
3256 }
3257
3258 int
3259 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3260 {
3261 RF_ComponentLabel_t clabel;
3262 struct vnode *vp;
3263 dev_t dev;
3264 int column;
3265 int sparecol;
3266
3267 raidPtr->autoconfigure = new_value;
3268
3269 for(column=0; column<raidPtr->numCol; column++) {
3270 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3271 dev = raidPtr->Disks[column].dev;
3272 vp = raidPtr->raid_cinfo[column].ci_vp;
3273 raidread_component_label(dev, vp, &clabel);
3274 clabel.autoconfigure = new_value;
3275 raidwrite_component_label(dev, vp, &clabel);
3276 }
3277 }
3278 for(column = 0; column < raidPtr->numSpare ; column++) {
3279 sparecol = raidPtr->numCol + column;
3280 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3281 dev = raidPtr->Disks[sparecol].dev;
3282 vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3283 raidread_component_label(dev, vp, &clabel);
3284 clabel.autoconfigure = new_value;
3285 raidwrite_component_label(dev, vp, &clabel);
3286 }
3287 }
3288 return(new_value);
3289 }
3290
3291 int
3292 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3293 {
3294 RF_ComponentLabel_t clabel;
3295 struct vnode *vp;
3296 dev_t dev;
3297 int column;
3298 int sparecol;
3299
3300 raidPtr->root_partition = new_value;
3301 for(column=0; column<raidPtr->numCol; column++) {
3302 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3303 dev = raidPtr->Disks[column].dev;
3304 vp = raidPtr->raid_cinfo[column].ci_vp;
3305 raidread_component_label(dev, vp, &clabel);
3306 clabel.root_partition = new_value;
3307 raidwrite_component_label(dev, vp, &clabel);
3308 }
3309 }
3310 for(column = 0; column < raidPtr->numSpare ; column++) {
3311 sparecol = raidPtr->numCol + column;
3312 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3313 dev = raidPtr->Disks[sparecol].dev;
3314 vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3315 raidread_component_label(dev, vp, &clabel);
3316 clabel.root_partition = new_value;
3317 raidwrite_component_label(dev, vp, &clabel);
3318 }
3319 }
3320 return(new_value);
3321 }
3322
3323 void
3324 rf_release_all_vps(RF_ConfigSet_t *cset)
3325 {
3326 RF_AutoConfig_t *ac;
3327
3328 ac = cset->ac;
3329 while(ac!=NULL) {
3330 /* Close the vp, and give it back */
3331 if (ac->vp) {
3332 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3333 VOP_CLOSE(ac->vp, FREAD, NOCRED, 0);
3334 vput(ac->vp);
3335 ac->vp = NULL;
3336 }
3337 ac = ac->next;
3338 }
3339 }
3340
3341
3342 void
3343 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3344 {
3345 RF_AutoConfig_t *ac;
3346 RF_AutoConfig_t *next_ac;
3347
3348 ac = cset->ac;
3349 while(ac!=NULL) {
3350 next_ac = ac->next;
3351 /* nuke the label */
3352 free(ac->clabel, M_RAIDFRAME);
3353 /* cleanup the config structure */
3354 free(ac, M_RAIDFRAME);
3355 /* "next.." */
3356 ac = next_ac;
3357 }
3358 /* and, finally, nuke the config set */
3359 free(cset, M_RAIDFRAME);
3360 }
3361
3362
3363 void
3364 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3365 {
3366 /* current version number */
3367 clabel->version = RF_COMPONENT_LABEL_VERSION;
3368 clabel->serial_number = raidPtr->serial_number;
3369 clabel->mod_counter = raidPtr->mod_counter;
3370 clabel->num_rows = 1;
3371 clabel->num_columns = raidPtr->numCol;
3372 clabel->clean = RF_RAID_DIRTY; /* not clean */
3373 clabel->status = rf_ds_optimal; /* "It's good!" */
3374
3375 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3376 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3377 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3378
3379 clabel->blockSize = raidPtr->bytesPerSector;
3380 clabel->numBlocks = raidPtr->sectorsPerDisk;
3381
3382 /* XXX not portable */
3383 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3384 clabel->maxOutstanding = raidPtr->maxOutstanding;
3385 clabel->autoconfigure = raidPtr->autoconfigure;
3386 clabel->root_partition = raidPtr->root_partition;
3387 clabel->last_unit = raidPtr->raidid;
3388 clabel->config_order = raidPtr->config_order;
3389 }
3390
/*
 * Configure a RAID set from an auto-detected config set.  On success
 * returns 0 and stores the chosen unit number in *unit; on failure
 * returns non-zero and leaves *unit as -1.
 */
int
rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	int retcode;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	retcode = 0;
	*unit = -1;

	/* 1. Create a config structure */

	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
				       M_RAIDFRAME,
				       M_NOWAIT);
	if (config==NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return(1);
	}

	memset(config, 0, sizeof(RF_Config_t));

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* Clamp a bogus label value into the valid unit range. */
	if ((raidID < 0) || (raidID >= numraid)) {
		/* let's not wander off into lala land. */
		raidID = numraid - 1;
	}
	if (raidPtrs[raidID]->valid != 0) {

		/*
		   Nope... Go looking for an alternative...
		   Start high so we don't immediately use raid0 if that's
		   not taken.
		*/

		/* If no unit is free, the loop terminates with
		   raidID == -1, caught below. */
		for(raidID = numraid - 1; raidID >= 0; raidID--) {
			if (raidPtrs[raidID]->valid == 0) {
				/* can use this one! */
				break;
			}
		}
	}

	if (raidID < 0) {
		/* punt... */
		printf("Unable to auto configure this set!\n");
		printf("(Out of RAID devs!)\n");
		free(config, M_RAIDFRAME);
		return(1);
	}

#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = raidPtrs[raidID];

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	retcode = rf_Configure(raidPtr, config, cset->ac);

	if (retcode == 0) {

		raidinit(raidPtrs[raidID]);

		/* Components just came up; force a parity check. */
		rf_markalldirty(raidPtrs[raidID]);
		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
		if (cset->ac->clabel->root_partition==1) {
			/* everything configured just fine.  Make a note
			   that this set is eligible to be root. */
			cset->rootable = 1;
			/* XXX do this here? */
			raidPtrs[raidID]->root_partition = 1;
		}
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);

	*unit = raidID;
	return(retcode);
}
3491
3492 void
3493 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3494 {
3495 struct buf *bp;
3496
3497 bp = (struct buf *)desc->bp;
3498 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3499 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3500 }
3501
/*
 * Initialize a fixed-size item pool with low/high watermarks.
 * xmin items are pre-allocated (primed) and also become the low
 * watermark; xmax becomes the high watermark.  w_chan is the wait
 * channel name passed to pool_init().
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3511
3512 /*
3513 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3514 * if there is IO pending and if that IO could possibly be done for a
3515 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3516 * otherwise.
3517 *
3518 */
3519
3520 int
3521 rf_buf_queue_check(int raidid)
3522 {
3523 if ((BUFQ_PEEK(raid_softc[raidid].buf_queue) != NULL) &&
3524 raidPtrs[raidid]->openings > 0) {
3525 /* there is work to do */
3526 return 0;
3527 }
3528 /* default is nothing to do */
3529 return 1;
3530 }
3531
3532 int
3533 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
3534 {
3535 struct partinfo dpart;
3536 struct dkwedge_info dkw;
3537 int error;
3538
3539 error = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, l->l_cred, l);
3540 if (error == 0) {
3541 diskPtr->blockSize = dpart.disklab->d_secsize;
3542 diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
3543 diskPtr->partitionSize = dpart.part->p_size;
3544 return 0;
3545 }
3546
3547 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, l->l_cred, l);
3548 if (error == 0) {
3549 diskPtr->blockSize = 512; /* XXX */
3550 diskPtr->numBlocks = dkw.dkw_size - rf_protectedSectors;
3551 diskPtr->partitionSize = dkw.dkw_size;
3552 return 0;
3553 }
3554 return error;
3555 }
3556
static int
raid_match(struct device *self, struct cfdata *cfdata,
    void *aux)
{
	/* Pseudo-device: always matches. */
	return (1);
}
3563
static void
raid_attach(struct device *parent, struct device *self,
    void *aux)
{
	/* Intentionally empty: the real setup happens when the unit
	   is configured via ioctl or autoconfiguration. */
}
3570
3571
3572 static int
3573 raid_detach(struct device *self, int flags)
3574 {
3575 struct raid_softc *rs = (struct raid_softc *)self;
3576
3577 if (rs->sc_flags & RAIDF_INITED)
3578 return EBUSY;
3579
3580 return 0;
3581 }
3582
3583 static void
3584 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
3585 {
3586 prop_dictionary_t disk_info, odisk_info, geom;
3587 disk_info = prop_dictionary_create();
3588 geom = prop_dictionary_create();
3589 prop_dictionary_set_uint64(geom, "sectors-per-unit",
3590 raidPtr->totalSectors);
3591 prop_dictionary_set_uint32(geom, "sector-size",
3592 raidPtr->bytesPerSector);
3593
3594 prop_dictionary_set_uint16(geom, "sectors-per-track",
3595 raidPtr->Layout.dataSectorsPerStripe);
3596 prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
3597 4 * raidPtr->numCol);
3598
3599 prop_dictionary_set_uint64(geom, "cylinders-per-unit",
3600 raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
3601 (4 * raidPtr->numCol)));
3602
3603 prop_dictionary_set(disk_info, "geometry", geom);
3604 prop_object_release(geom);
3605 prop_dictionary_set(device_properties(rs->sc_dev),
3606 "disk-info", disk_info);
3607 odisk_info = rs->sc_dkdev.dk_info;
3608 rs->sc_dkdev.dk_info = disk_info;
3609 if (odisk_info)
3610 prop_object_release(odisk_info);
3611 }
3612