rf_netbsdkintf.c revision 1.224.2.1 1 /* $NetBSD: rf_netbsdkintf.c,v 1.224.2.1 2007/12/19 20:07:53 ghen Exp $ */
2 /*-
3 * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Greg Oster; Jason R. Thorpe.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the NetBSD
20 * Foundation, Inc. and its contributors.
21 * 4. Neither the name of The NetBSD Foundation nor the names of its
22 * contributors may be used to endorse or promote products derived
23 * from this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38 /*
39 * Copyright (c) 1990, 1993
40 * The Regents of the University of California. All rights reserved.
41 *
42 * This code is derived from software contributed to Berkeley by
43 * the Systems Programming Group of the University of Utah Computer
44 * Science Department.
45 *
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted provided that the following conditions
48 * are met:
49 * 1. Redistributions of source code must retain the above copyright
50 * notice, this list of conditions and the following disclaimer.
51 * 2. Redistributions in binary form must reproduce the above copyright
52 * notice, this list of conditions and the following disclaimer in the
53 * documentation and/or other materials provided with the distribution.
54 * 3. Neither the name of the University nor the names of its contributors
55 * may be used to endorse or promote products derived from this software
56 * without specific prior written permission.
57 *
58 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
59 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
60 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 *
70 * from: Utah $Hdr: cd.c 1.6 90/11/28$
71 *
72 * @(#)cd.c 8.2 (Berkeley) 11/16/93
73 */
74
75 /*
76 * Copyright (c) 1988 University of Utah.
77 *
78 * This code is derived from software contributed to Berkeley by
79 * the Systems Programming Group of the University of Utah Computer
80 * Science Department.
81 *
82 * Redistribution and use in source and binary forms, with or without
83 * modification, are permitted provided that the following conditions
84 * are met:
85 * 1. Redistributions of source code must retain the above copyright
86 * notice, this list of conditions and the following disclaimer.
87 * 2. Redistributions in binary form must reproduce the above copyright
88 * notice, this list of conditions and the following disclaimer in the
89 * documentation and/or other materials provided with the distribution.
90 * 3. All advertising materials mentioning features or use of this software
91 * must display the following acknowledgement:
92 * This product includes software developed by the University of
93 * California, Berkeley and its contributors.
94 * 4. Neither the name of the University nor the names of its contributors
95 * may be used to endorse or promote products derived from this software
96 * without specific prior written permission.
97 *
98 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
99 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
100 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
101 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
102 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
103 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
104 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
105 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
106 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
107 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
108 * SUCH DAMAGE.
109 *
110 * from: Utah $Hdr: cd.c 1.6 90/11/28$
111 *
112 * @(#)cd.c 8.2 (Berkeley) 11/16/93
113 */
114
115 /*
116 * Copyright (c) 1995 Carnegie-Mellon University.
117 * All rights reserved.
118 *
119 * Authors: Mark Holland, Jim Zelenka
120 *
121 * Permission to use, copy, modify and distribute this software and
122 * its documentation is hereby granted, provided that both the copyright
123 * notice and this permission notice appear in all copies of the
124 * software, derivative works or modified versions, and any portions
125 * thereof, and that both notices appear in supporting documentation.
126 *
127 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
128 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
129 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
130 *
131 * Carnegie Mellon requests users of this software to return to
132 *
133 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
134 * School of Computer Science
135 * Carnegie Mellon University
136 * Pittsburgh PA 15213-3890
137 *
138 * any improvements or extensions that they make and grant Carnegie the
139 * rights to redistribute these changes.
140 */
141
142 /***********************************************************
143 *
144 * rf_kintf.c -- the kernel interface routines for RAIDframe
145 *
146 ***********************************************************/
147
148 #include <sys/cdefs.h>
149 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.224.2.1 2007/12/19 20:07:53 ghen Exp $");
150
151 #include <sys/param.h>
152 #include <sys/errno.h>
153 #include <sys/pool.h>
154 #include <sys/proc.h>
155 #include <sys/queue.h>
156 #include <sys/disk.h>
157 #include <sys/device.h>
158 #include <sys/stat.h>
159 #include <sys/ioctl.h>
160 #include <sys/fcntl.h>
161 #include <sys/systm.h>
162 #include <sys/namei.h>
163 #include <sys/vnode.h>
164 #include <sys/disklabel.h>
165 #include <sys/conf.h>
166 #include <sys/lock.h>
167 #include <sys/buf.h>
168 #include <sys/bufq.h>
169 #include <sys/user.h>
170 #include <sys/reboot.h>
171 #include <sys/kauth.h>
172
173 #include <dev/raidframe/raidframevar.h>
174 #include <dev/raidframe/raidframeio.h>
175 #include "raid.h"
176 #include "opt_raid_autoconfig.h"
177 #include "rf_raid.h"
178 #include "rf_copyback.h"
179 #include "rf_dag.h"
180 #include "rf_dagflags.h"
181 #include "rf_desc.h"
182 #include "rf_diskqueue.h"
183 #include "rf_etimer.h"
184 #include "rf_general.h"
185 #include "rf_kintf.h"
186 #include "rf_options.h"
187 #include "rf_driver.h"
188 #include "rf_parityscan.h"
189 #include "rf_threadstuff.h"
190
191 #ifdef DEBUG
192 int rf_kdebug_level = 0;
193 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
194 #else /* DEBUG */
195 #define db1_printf(a) { }
196 #endif /* DEBUG */
197
198 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
199
200 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
201
202 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
203 * spare table */
204 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
205 * installation process */
206
207 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
208
209 /* prototypes */
210 static void KernelWakeupFunc(struct buf *);
211 static void InitBP(struct buf *, struct vnode *, unsigned,
212 dev_t, RF_SectorNum_t, RF_SectorCount_t, caddr_t, void (*) (struct buf *),
213 void *, int, struct proc *);
214 static void raidinit(RF_Raid_t *);
215
216 void raidattach(int);
217 static int raid_match(struct device *, struct cfdata *, void *);
218 static void raid_attach(struct device *, struct device *, void *);
219 static int raid_detach(struct device *, int);
220
221 dev_type_open(raidopen);
222 dev_type_close(raidclose);
223 dev_type_read(raidread);
224 dev_type_write(raidwrite);
225 dev_type_ioctl(raidioctl);
226 dev_type_strategy(raidstrategy);
227 dev_type_dump(raiddump);
228 dev_type_size(raidsize);
229
230 const struct bdevsw raid_bdevsw = {
231 raidopen, raidclose, raidstrategy, raidioctl,
232 raiddump, raidsize, D_DISK
233 };
234
235 const struct cdevsw raid_cdevsw = {
236 raidopen, raidclose, raidread, raidwrite, raidioctl,
237 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
238 };
239
240 /* XXX Not sure if the following should be replacing the raidPtrs above,
241 or if it should be used in conjunction with that...
242 */
243
/*
 * Per-unit software state for a RAIDframe pseudo-disk.  One entry per
 * unit is allocated (in an array) at attach time; see the raid_softc
 * global pointer declared later in this file.
 */
244 struct raid_softc {
245 struct device *sc_dev;
246 int sc_flags; /* flags */
247 int sc_cflags; /* configuration flags */
248 uint64_t sc_size; /* size of the raid device */
249 char sc_xname[20]; /* XXX external name */
250 struct disk sc_dkdev; /* generic disk device info */
251 struct bufq_state *buf_queue; /* used for the device queue */
252 };
/* Values for sc_flags (bit flags; 0x08..0x20 are unused here). */
253 /* sc_flags */
254 #define RAIDF_INITED 0x01 /* unit has been initialized */
255 #define RAIDF_WLABEL 0x02 /* label area is writable */
256 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
257 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
258 #define RAIDF_LOCKED 0x80 /* unit is locked */
259
260 #define raidunit(x) DISKUNIT(x)
261 int numraid = 0;
262
263 extern struct cfdriver raid_cd;
264 CFATTACH_DECL(raid, sizeof(struct raid_softc),
265 raid_match, raid_attach, raid_detach, NULL);
266
267 /*
268 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
269 * Be aware that large numbers can allow the driver to consume a lot of
270 * kernel memory, especially on writes, and in degraded mode reads.
271 *
272 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
273 * a single 64K write will typically require 64K for the old data,
274 * 64K for the old parity, and 64K for the new parity, for a total
275 * of 192K (if the parity buffer is not re-used immediately).
276 * Even it if is used immediately, that's still 128K, which when multiplied
277 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
278 *
279 * Now in degraded mode, for example, a 64K read on the above setup may
280 * require data reconstruction, which will require *all* of the 4 remaining
281 * disks to participate -- 4 * 32K/disk == 128K again.
282 */
283
284 #ifndef RAIDOUTSTANDING
285 #define RAIDOUTSTANDING 6
286 #endif
287
288 #define RAIDLABELDEV(dev) \
289 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
290
291 /* declared here, and made public, for the benefit of KVM stuff.. */
292 struct raid_softc *raid_softc;
293
294 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
295 struct disklabel *);
296 static void raidgetdisklabel(dev_t);
297 static void raidmakedisklabel(struct raid_softc *);
298
299 static int raidlock(struct raid_softc *);
300 static void raidunlock(struct raid_softc *);
301
302 static void rf_markalldirty(RF_Raid_t *);
303
304 void rf_ReconThread(struct rf_recon_req *);
305 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
306 void rf_CopybackThread(RF_Raid_t *raidPtr);
307 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
308 int rf_autoconfig(struct device *self);
309 void rf_buildroothack(RF_ConfigSet_t *);
310
311 RF_AutoConfig_t *rf_find_raid_components(void);
312 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
313 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
314 static int rf_reasonable_label(RF_ComponentLabel_t *);
315 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
316 int rf_set_autoconfig(RF_Raid_t *, int);
317 int rf_set_rootpartition(RF_Raid_t *, int);
318 void rf_release_all_vps(RF_ConfigSet_t *);
319 void rf_cleanup_config_set(RF_ConfigSet_t *);
320 int rf_have_enough_components(RF_ConfigSet_t *);
321 int rf_auto_config_set(RF_ConfigSet_t *, int *);
322
323 static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not
324 allow autoconfig to take place.
325 Note that this is overridden by having
326 RAID_AUTOCONFIG as an option in the
327 kernel config file. */
328
329 struct RF_Pools_s rf_pools;
330
/*
 * raidattach: pseudo-device attach routine, called once at boot with
 * the number of units ("raid%d" minors) requested in the kernel
 * configuration.  Allocates the per-unit descriptor and softc arrays,
 * boots the RAIDframe core, attaches the autoconf glue, and registers
 * a config finalizer so auto-configuration of RAID sets runs after all
 * real hardware has been found.  On allocation failure it trims
 * numraid so later entry points stay within the arrays.
 */
331 void
332 raidattach(int num)
333 {
334 int raidID;
335 int i, rc;
336 
337 #ifdef DEBUG
338 printf("raidattach: Asked for %d units\n", num);
339 #endif
340 
341 if (num <= 0) {
342 #ifdef DIAGNOSTIC
343 panic("raidattach: count <= 0");
344 #endif
345 return;
346 }
347 /* This is where all the initialization stuff gets done. */
348 
349 numraid = num;
350 
351 /* Make some space for requested number of units... */
352 
353 RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
354 if (raidPtrs == NULL) {
355 panic("raidPtrs is NULL!!");
356 }
357 
/* Mutex guarding the spare-table wait/response queues declared above. */
358 rf_mutex_init(&rf_sparet_wait_mutex);
359 
360 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
361 
362 for (i = 0; i < num; i++)
363 raidPtrs[i] = NULL;
/* Bring up the RAIDframe core; the driver is useless without it. */
364 rc = rf_BootRaidframe();
365 if (rc == 0)
366 printf("Kernelized RAIDframe activated\n");
367 else
368 panic("Serious error booting RAID!!");
369 
370 /* put together some datastructures like the CCD device does.. This
371 * lets us lock the device and what-not when it gets opened. */
372 
373 raid_softc = (struct raid_softc *)
374 malloc(num * sizeof(struct raid_softc),
375 M_RAIDFRAME, M_NOWAIT);
376 if (raid_softc == NULL) {
377 printf("WARNING: no memory for RAIDframe driver\n");
378 return;
379 }
380 
381 memset(raid_softc, 0, num * sizeof(struct raid_softc));
382 
/* Per-unit setup: a FCFS buffer queue and a raid descriptor each. */
383 for (raidID = 0; raidID < num; raidID++) {
384 bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);
385 
386 RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
387 (RF_Raid_t *));
388 if (raidPtrs[raidID] == NULL) {
389 printf("WARNING: raidPtrs[%d] is NULL\n", raidID);
390 numraid = raidID;
391 return;
392 }
393 }
394 
395 if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
396 printf("config_cfattach_attach failed?\n");
397 }
398 
399 #ifdef RAID_AUTOCONFIG
400 raidautoconfig = 1;
401 #endif
402 
403 /*
404 * Register a finalizer which will be used to auto-config RAID
405 * sets once all real hardware devices have been found.
406 */
407 if (config_finalize_register(NULL, rf_autoconfig) != 0)
408 printf("WARNING: unable to register RAIDframe finalizer\n");
409 }
410
/*
 * rf_autoconfig: config finalizer registered by raidattach().  Scans
 * the system for RAID components, groups them into configuration sets,
 * and hands the sets to rf_buildroothack() for actual configuration.
 * Returns 0 when autoconfig is disabled (nothing done), 1 otherwise;
 * per the config_finalize protocol a non-zero return requests another
 * finalization pass.  Guarded so the scan runs at most once.
 */
411 int
412 rf_autoconfig(struct device *self)
413 {
414 RF_AutoConfig_t *ac_list;
415 RF_ConfigSet_t *config_sets;
416 int i;
417 
418 if (raidautoconfig == 0)
419 return (0);
420 
421 /* XXX This code can only be run once. */
422 raidautoconfig = 0;
423 
424 /* 1. locate all RAID components on the system */
425 #ifdef DEBUG
426 printf("Searching for RAID components...\n");
427 #endif
428 ac_list = rf_find_raid_components();
429 
430 /* 2. Sort them into their respective sets. */
431 config_sets = rf_create_auto_sets(ac_list);
432 
433 /*
434 * 3. Evaluate each set and configure the valid ones.
435 * This gets done in rf_buildroothack().
436 */
437 rf_buildroothack(config_sets);
438 
/* 4. Discover wedges on every set that came up as valid. */
439 for (i = 0; i < numraid; i++)
440 if (raidPtrs[i] != NULL && raidPtrs[i]->valid)
441 dkwedge_discover(&raid_softc[i].sc_dkdev);
442 
443 return 1;
444 }
445
446 void
447 rf_buildroothack(RF_ConfigSet_t *config_sets)
448 {
449 RF_ConfigSet_t *cset;
450 RF_ConfigSet_t *next_cset;
451 int retcode;
452 int raidID;
453 int rootID;
454 int num_root;
455
456 rootID = 0;
457 num_root = 0;
458 cset = config_sets;
459 while(cset != NULL ) {
460 next_cset = cset->next;
461 if (rf_have_enough_components(cset) &&
462 cset->ac->clabel->autoconfigure==1) {
463 retcode = rf_auto_config_set(cset,&raidID);
464 if (!retcode) {
465 #ifdef DEBUG
466 printf("raid%d: configured ok\n", raidID);
467 #endif
468 if (cset->rootable) {
469 rootID = raidID;
470 num_root++;
471 }
472 } else {
473 /* The autoconfig didn't work :( */
474 #ifdef DEBUG
475 printf("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
476 #endif
477 rf_release_all_vps(cset);
478 }
479 } else {
480 #ifdef DEBUG
481 printf("raid%d: not enough components\n", raidID);
482 #endif
483 /* we're not autoconfiguring this set...
484 release the associated resources */
485 rf_release_all_vps(cset);
486 }
487 /* cleanup */
488 rf_cleanup_config_set(cset);
489 cset = next_cset;
490 }
491
492 /* if the user has specified what the root device should be
493 then we don't touch booted_device or boothowto... */
494
495 if (rootspec != NULL)
496 return;
497
498 /* we found something bootable... */
499
500 if (num_root == 1) {
501 booted_device = raid_softc[rootID].sc_dev;
502 } else if (num_root > 1) {
503 /* we can't guess.. require the user to answer... */
504 boothowto |= RB_ASKNAME;
505 }
506 }
507
508
/*
 * raidsize: return the size (in DEV_BSIZE units) of the given
 * partition if it is configured as swap (FS_SWAP), or -1 on any
 * error.  Used by the kernel dump/swap machinery.  If the partition
 * is not currently open, it is transiently opened and closed around
 * the label lookup so the disklabel is valid.
 */
509 int
510 raidsize(dev_t dev)
511 {
512 struct raid_softc *rs;
513 struct disklabel *lp;
514 int part, unit, omask, size;
515 
516 unit = raidunit(dev);
517 if (unit >= numraid)
518 return (-1);
519 rs = &raid_softc[unit];
520 
521 if ((rs->sc_flags & RAIDF_INITED) == 0)
522 return (-1);
523 
524 part = DISKPART(dev);
/* omask != 0 means the partition is already open; skip open/close. */
525 omask = rs->sc_dkdev.dk_openmask & (1 << part);
526 lp = rs->sc_dkdev.dk_label;
527 
528 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
529 return (-1);
530 
/* Only swap partitions have a meaningful size for dumping. */
531 if (lp->d_partitions[part].p_fstype != FS_SWAP)
532 size = -1;
533 else
534 size = lp->d_partitions[part].p_size *
535 (lp->d_secsize / DEV_BSIZE);
536 
537 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
538 return (-1);
539 
540 return (size);
541 
542 }
543
/*
 * raiddump: crash-dump entry point.  Writes `size' bytes at `va' to
 * block `blkno' of the dump partition.  Only RAID 1 sets (one data
 * column, one parity column) are supported; the dump is redirected to
 * a single live component (or a used spare standing in for one),
 * preferring the master side of the mirror, and performed via that
 * component's own bdevsw d_dump routine.  Returns 0 or an errno.
 */
544 int
545 raiddump(dev_t dev, daddr_t blkno, caddr_t va, size_t size)
546 {
547 int unit = raidunit(dev);
548 struct raid_softc *rs;
549 const struct bdevsw *bdev;
550 struct disklabel *lp;
551 RF_Raid_t *raidPtr;
552 daddr_t offset;
553 int part, c, sparecol, j, scol, dumpto;
554 int error = 0;
555 
556 if (unit >= numraid)
557 return (ENXIO);
558 
559 rs = &raid_softc[unit];
560 raidPtr = raidPtrs[unit];
561 
562 if ((rs->sc_flags & RAIDF_INITED) == 0)
563 return ENXIO;
564 
565 /* we only support dumping to RAID 1 sets */
566 if (raidPtr->Layout.numDataCol != 1 ||
567 raidPtr->Layout.numParityCol != 1)
568 return EINVAL;
569 
570 
571 if ((error = raidlock(rs)) != 0)
572 return error;
573 
/* The dump must be a whole number of DEV_BSIZE blocks. */
574 if (size % DEV_BSIZE != 0) {
575 error = EINVAL;
576 goto out;
577 }
578 
/* Refuse a dump that would run off the end of the device. */
579 if (blkno + size / DEV_BSIZE > rs->sc_size) {
580 printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
581 "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
582 size / DEV_BSIZE, rs->sc_size);
583 error = EINVAL;
584 goto out;
585 }
586 
587 part = DISKPART(dev);
588 lp = rs->sc_dkdev.dk_label;
589 offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
590 
591 /* figure out what device is alive.. */
592 
593 /*
594 Look for a component to dump to. The preference for the
595 component to dump to is as follows:
596 1) the master
597 2) a used_spare of the master
598 3) the slave
599 4) a used_spare of the slave
600 */
601 
/* First pass: take the first optimal (live) component, master first. */
602 dumpto = -1;
603 for (c = 0; c < raidPtr->numCol; c++) {
604 if (raidPtr->Disks[c].status == rf_ds_optimal) {
605 /* this might be the one */
606 dumpto = c;
607 break;
608 }
609 }
610 
611 /*
612 At this point we have possibly selected a live master or a
613 live slave. We now check to see if there is a spared
614 master (or a spared slave), if we didn't find a live master
615 or a live slave.
616 */
617 
/* Second pass: consider used spares; scol is the column each spares. */
618 for (c = 0; c < raidPtr->numSpare; c++) {
619 sparecol = raidPtr->numCol + c;
620 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
621 /* How about this one? */
622 scol = -1;
623 for(j=0;j<raidPtr->numCol;j++) {
624 if (raidPtr->Disks[j].spareCol == sparecol) {
625 scol = j;
626 break;
627 }
628 }
629 if (scol == 0) {
630 /*
631 We must have found a spared master!
632 We'll take that over anything else
633 found so far. (We couldn't have
634 found a real master before, since
635 this is a used spare, and it's
636 saying that it's replacing the
637 master.) On reboot (with
638 autoconfiguration turned on)
639 sparecol will become the 1st
640 component (component0) of this set.
641 */
642 dumpto = sparecol;
643 break;
644 } else if (scol != -1) {
645 /*
646 Must be a spared slave. We'll dump
647 to that if we haven't found anything
648 else so far.
649 */
650 if (dumpto == -1)
651 dumpto = sparecol;
652 }
653 }
654 }
655 
656 if (dumpto == -1) {
657 /* we couldn't find any live components to dump to!?!?
658 */
659 error = EINVAL;
660 goto out;
661 }
662 
/* NOTE(review): bdevsw_lookup() can return NULL; the result is used
   unchecked below -- presumably a configured component always has a
   registered bdevsw entry, but confirm. */
663 bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
664 
665 /*
666 Note that blkno is relative to this particular partition.
667 By adding the offset of this partition in the RAID
668 set, and also adding RF_PROTECTED_SECTORS, we get a
669 value that is relative to the partition used for the
670 underlying component.
671 */
672 
673 error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
674 blkno + offset, va, size);
675 
676 out:
677 raidunlock(rs);
678 
679 return error;
680 }
/*
 * raidopen: block/character open entry point.  Validates the unit and
 * partition, (re)reads the disklabel on the first open of a configured
 * unit, records the open in the per-format open masks, and on the very
 * first open of a configured set marks all components dirty so an
 * unclean shutdown is detectable.  Returns 0 or an errno; the unit
 * lock is held for the duration.
 */
681 /* ARGSUSED */
682 int
683 raidopen(dev_t dev, int flags, int fmt,
684 struct lwp *l)
685 {
686 int unit = raidunit(dev);
687 struct raid_softc *rs;
688 struct disklabel *lp;
689 int part, pmask;
690 int error = 0;
691 
692 if (unit >= numraid)
693 return (ENXIO);
694 rs = &raid_softc[unit];
695 
696 if ((error = raidlock(rs)) != 0)
697 return (error);
698 lp = rs->sc_dkdev.dk_label;
699 
700 part = DISKPART(dev);
701 
702 /*
703 * If there are wedges, and this is not RAW_PART, then we
704 * need to fail.
705 */
706 if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
707 error = EBUSY;
708 goto bad;
709 }
710 pmask = (1 << part);
711 
/* First open of a configured unit: refresh the in-core disklabel. */
712 if ((rs->sc_flags & RAIDF_INITED) &&
713 (rs->sc_dkdev.dk_openmask == 0))
714 raidgetdisklabel(dev);
715 
716 /* make sure that this partition exists */
717 
718 if (part != RAW_PART) {
719 if (((rs->sc_flags & RAIDF_INITED) == 0) ||
720 ((part >= lp->d_npartitions) ||
721 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
722 error = ENXIO;
723 goto bad;
724 }
725 }
726 /* Prevent this unit from being unconfigured while open. */
727 switch (fmt) {
728 case S_IFCHR:
729 rs->sc_dkdev.dk_copenmask |= pmask;
730 break;
731 
732 case S_IFBLK:
733 rs->sc_dkdev.dk_bopenmask |= pmask;
734 break;
735 }
736 
/* dk_openmask is still the pre-open value here, so == 0 means this is
   the very first opener of the unit. */
737 if ((rs->sc_dkdev.dk_openmask == 0) &&
738 ((rs->sc_flags & RAIDF_INITED) != 0)) {
739 /* First one... mark things as dirty... Note that we *MUST*
740 have done a configure before this. I DO NOT WANT TO BE
741 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
742 THAT THEY BELONG TOGETHER!!!!! */
743 /* XXX should check to see if we're only open for reading
744 here... If so, we needn't do this, but then need some
745 other way of keeping track of what's happened.. */
746 
747 rf_markalldirty( raidPtrs[unit] );
748 }
749 
750 
751 rs->sc_dkdev.dk_openmask =
752 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
753 
754 bad:
755 raidunlock(rs);
756 
757 return (error);
758 
759 
760 }
/*
 * raidclose: block/character close entry point.  Clears the partition
 * bit from the per-format open mask; on the last close of a configured
 * unit, writes final (clean) component labels, and if the system is
 * shutting down also shuts the RAID set down and detaches the
 * pseudo-device and its disk structure.
 */
761 /* ARGSUSED */
762 int
763 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
764 {
765 int unit = raidunit(dev);
766 struct cfdata *cf;
767 struct raid_softc *rs;
768 int error = 0;
769 int part;
770 
771 if (unit >= numraid)
772 return (ENXIO);
773 rs = &raid_softc[unit];
774 
775 if ((error = raidlock(rs)) != 0)
776 return (error);
777 
778 part = DISKPART(dev);
779 
780 /* ...that much closer to allowing unconfiguration... */
781 switch (fmt) {
782 case S_IFCHR:
783 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
784 break;
785 
786 case S_IFBLK:
787 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
788 break;
789 }
790 rs->sc_dkdev.dk_openmask =
791 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
792 
793 if ((rs->sc_dkdev.dk_openmask == 0) &&
794 ((rs->sc_flags & RAIDF_INITED) != 0)) {
795 /* Last one... device is not unconfigured yet.
796 Device shutdown has taken care of setting the
797 clean bits if RAIDF_INITED is not set
798 mark things as clean... */
799 
800 rf_update_component_labels(raidPtrs[unit],
801 RF_FINAL_COMPONENT_UPDATE);
802 if (doing_shutdown) {
803 /* last one, and we're going down, so
804 lights out for this RAID set too. */
805 error = rf_Shutdown(raidPtrs[unit]);
806 
807 /* It's no longer initialized... */
808 rs->sc_flags &= ~RAIDF_INITED;
809 
810 /* detach the device */
811 
812 cf = device_cfdata(rs->sc_dev);
813 error = config_detach(rs->sc_dev, DETACH_QUIET);
814 free(cf, M_RAIDFRAME);
815 
816 /* Detach the disk. */
817 pseudo_disk_detach(&rs->sc_dkdev);
818 }
819 }
820 
821 raidunlock(rs);
/* NOTE(review): errors from rf_Shutdown()/config_detach() are stored in
   'error' above but this function always returns 0 -- confirm whether
   surfacing those errors to the caller was intended. */
822 return (0);
823 
824 }
825
826 void
827 raidstrategy(struct buf *bp)
828 {
829 int s;
830
831 unsigned int raidID = raidunit(bp->b_dev);
832 RF_Raid_t *raidPtr;
833 struct raid_softc *rs = &raid_softc[raidID];
834 int wlabel;
835
836 if ((rs->sc_flags & RAIDF_INITED) ==0) {
837 bp->b_error = ENXIO;
838 bp->b_flags |= B_ERROR;
839 goto done;
840 }
841 if (raidID >= numraid || !raidPtrs[raidID]) {
842 bp->b_error = ENODEV;
843 bp->b_flags |= B_ERROR;
844 goto done;
845 }
846 raidPtr = raidPtrs[raidID];
847 if (!raidPtr->valid) {
848 bp->b_error = ENODEV;
849 bp->b_flags |= B_ERROR;
850 goto done;
851 }
852 if (bp->b_bcount == 0) {
853 db1_printf(("b_bcount is zero..\n"));
854 goto done;
855 }
856
857 /*
858 * Do bounds checking and adjust transfer. If there's an
859 * error, the bounds check will flag that for us.
860 */
861
862 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
863 if (DISKPART(bp->b_dev) == RAW_PART) {
864 uint64_t size; /* device size in DEV_BSIZE unit */
865
866 if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
867 size = raidPtr->totalSectors <<
868 (raidPtr->logBytesPerSector - DEV_BSHIFT);
869 } else {
870 size = raidPtr->totalSectors >>
871 (DEV_BSHIFT - raidPtr->logBytesPerSector);
872 }
873 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
874 goto done;
875 }
876 } else {
877 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
878 db1_printf(("Bounds check failed!!:%d %d\n",
879 (int) bp->b_blkno, (int) wlabel));
880 goto done;
881 }
882 }
883 s = splbio();
884
885 bp->b_resid = 0;
886
887 /* stuff it onto our queue */
888 BUFQ_PUT(rs->buf_queue, bp);
889
890 /* scheduled the IO to happen at the next convenient time */
891 wakeup(&(raidPtrs[raidID]->iodone));
892
893 splx(s);
894 return;
895
896 done:
897 bp->b_resid = bp->b_bcount;
898 biodone(bp);
899 }
900 /* ARGSUSED */
901 int
902 raidread(dev_t dev, struct uio *uio, int flags)
903 {
904 int unit = raidunit(dev);
905 struct raid_softc *rs;
906
907 if (unit >= numraid)
908 return (ENXIO);
909 rs = &raid_softc[unit];
910
911 if ((rs->sc_flags & RAIDF_INITED) == 0)
912 return (ENXIO);
913
914 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
915
916 }
917 /* ARGSUSED */
918 int
919 raidwrite(dev_t dev, struct uio *uio, int flags)
920 {
921 int unit = raidunit(dev);
922 struct raid_softc *rs;
923
924 if (unit >= numraid)
925 return (ENXIO);
926 rs = &raid_softc[unit];
927
928 if ((rs->sc_flags & RAIDF_INITED) == 0)
929 return (ENXIO);
930
931 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
932
933 }
934
935 int
936 raidioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct lwp *l)
937 {
938 int unit = raidunit(dev);
939 int error = 0;
940 int part, pmask;
941 struct cfdata *cf;
942 struct raid_softc *rs;
943 RF_Config_t *k_cfg, *u_cfg;
944 RF_Raid_t *raidPtr;
945 RF_RaidDisk_t *diskPtr;
946 RF_AccTotals_t *totals;
947 RF_DeviceConfig_t *d_cfg, **ucfgp;
948 u_char *specific_buf;
949 int retcode = 0;
950 int column;
951 int raidid;
952 struct rf_recon_req *rrcopy, *rr;
953 RF_ComponentLabel_t *clabel;
954 RF_ComponentLabel_t *ci_label;
955 RF_ComponentLabel_t **clabel_ptr;
956 RF_SingleComponent_t *sparePtr,*componentPtr;
957 RF_SingleComponent_t component;
958 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
959 int i, j, d;
960 #ifdef __HAVE_OLD_DISKLABEL
961 struct disklabel newlabel;
962 #endif
963 struct dkwedge_info *dkw;
964
965 if (unit >= numraid)
966 return (ENXIO);
967 rs = &raid_softc[unit];
968 raidPtr = raidPtrs[unit];
969
970 db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
971 (int) DISKPART(dev), (int) unit, (int) cmd));
972
973 /* Must be open for writes for these commands... */
974 switch (cmd) {
975 #ifdef DIOCGSECTORSIZE
976 case DIOCGSECTORSIZE:
977 *(u_int *)data = raidPtr->bytesPerSector;
978 return 0;
979 case DIOCGMEDIASIZE:
980 *(off_t *)data =
981 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
982 return 0;
983 #endif
984 case DIOCSDINFO:
985 case DIOCWDINFO:
986 #ifdef __HAVE_OLD_DISKLABEL
987 case ODIOCWDINFO:
988 case ODIOCSDINFO:
989 #endif
990 case DIOCWLABEL:
991 case DIOCAWEDGE:
992 case DIOCDWEDGE:
993 if ((flag & FWRITE) == 0)
994 return (EBADF);
995 }
996
997 /* Must be initialized for these... */
998 switch (cmd) {
999 case DIOCGDINFO:
1000 case DIOCSDINFO:
1001 case DIOCWDINFO:
1002 #ifdef __HAVE_OLD_DISKLABEL
1003 case ODIOCGDINFO:
1004 case ODIOCWDINFO:
1005 case ODIOCSDINFO:
1006 case ODIOCGDEFLABEL:
1007 #endif
1008 case DIOCGPART:
1009 case DIOCWLABEL:
1010 case DIOCGDEFLABEL:
1011 case DIOCAWEDGE:
1012 case DIOCDWEDGE:
1013 case DIOCLWEDGES:
1014 case RAIDFRAME_SHUTDOWN:
1015 case RAIDFRAME_REWRITEPARITY:
1016 case RAIDFRAME_GET_INFO:
1017 case RAIDFRAME_RESET_ACCTOTALS:
1018 case RAIDFRAME_GET_ACCTOTALS:
1019 case RAIDFRAME_KEEP_ACCTOTALS:
1020 case RAIDFRAME_GET_SIZE:
1021 case RAIDFRAME_FAIL_DISK:
1022 case RAIDFRAME_COPYBACK:
1023 case RAIDFRAME_CHECK_RECON_STATUS:
1024 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1025 case RAIDFRAME_GET_COMPONENT_LABEL:
1026 case RAIDFRAME_SET_COMPONENT_LABEL:
1027 case RAIDFRAME_ADD_HOT_SPARE:
1028 case RAIDFRAME_REMOVE_HOT_SPARE:
1029 case RAIDFRAME_INIT_LABELS:
1030 case RAIDFRAME_REBUILD_IN_PLACE:
1031 case RAIDFRAME_CHECK_PARITY:
1032 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1033 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1034 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1035 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1036 case RAIDFRAME_SET_AUTOCONFIG:
1037 case RAIDFRAME_SET_ROOT:
1038 case RAIDFRAME_DELETE_COMPONENT:
1039 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1040 if ((rs->sc_flags & RAIDF_INITED) == 0)
1041 return (ENXIO);
1042 }
1043
1044 switch (cmd) {
1045
1046 /* configure the system */
1047 case RAIDFRAME_CONFIGURE:
1048
1049 if (raidPtr->valid) {
1050 /* There is a valid RAID set running on this unit! */
1051 printf("raid%d: Device already configured!\n",unit);
1052 return(EINVAL);
1053 }
1054
1055 /* copy-in the configuration information */
1056 /* data points to a pointer to the configuration structure */
1057
1058 u_cfg = *((RF_Config_t **) data);
1059 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1060 if (k_cfg == NULL) {
1061 return (ENOMEM);
1062 }
1063 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1064 if (retcode) {
1065 RF_Free(k_cfg, sizeof(RF_Config_t));
1066 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1067 retcode));
1068 return (retcode);
1069 }
1070 /* allocate a buffer for the layout-specific data, and copy it
1071 * in */
1072 if (k_cfg->layoutSpecificSize) {
1073 if (k_cfg->layoutSpecificSize > 10000) {
1074 /* sanity check */
1075 RF_Free(k_cfg, sizeof(RF_Config_t));
1076 return (EINVAL);
1077 }
1078 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1079 (u_char *));
1080 if (specific_buf == NULL) {
1081 RF_Free(k_cfg, sizeof(RF_Config_t));
1082 return (ENOMEM);
1083 }
1084 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1085 k_cfg->layoutSpecificSize);
1086 if (retcode) {
1087 RF_Free(k_cfg, sizeof(RF_Config_t));
1088 RF_Free(specific_buf,
1089 k_cfg->layoutSpecificSize);
1090 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1091 retcode));
1092 return (retcode);
1093 }
1094 } else
1095 specific_buf = NULL;
1096 k_cfg->layoutSpecific = specific_buf;
1097
1098 /* should do some kind of sanity check on the configuration.
1099 * Store the sum of all the bytes in the last byte? */
1100
1101 /* configure the system */
1102
1103 /*
1104 * Clear the entire RAID descriptor, just to make sure
1105 * there is no stale data left in the case of a
1106 * reconfiguration
1107 */
1108 memset((char *) raidPtr, 0, sizeof(RF_Raid_t));
1109 raidPtr->raidid = unit;
1110
1111 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1112
1113 if (retcode == 0) {
1114
1115 /* allow this many simultaneous IO's to
1116 this RAID device */
1117 raidPtr->openings = RAIDOUTSTANDING;
1118
1119 raidinit(raidPtr);
1120 rf_markalldirty(raidPtr);
1121 }
1122 /* free the buffers. No return code here. */
1123 if (k_cfg->layoutSpecificSize) {
1124 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1125 }
1126 RF_Free(k_cfg, sizeof(RF_Config_t));
1127
1128 return (retcode);
1129
1130 /* shutdown the system */
1131 case RAIDFRAME_SHUTDOWN:
1132
1133 if ((error = raidlock(rs)) != 0)
1134 return (error);
1135
1136 /*
1137 * If somebody has a partition mounted, we shouldn't
1138 * shutdown.
1139 */
1140
1141 part = DISKPART(dev);
1142 pmask = (1 << part);
1143 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1144 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1145 (rs->sc_dkdev.dk_copenmask & pmask))) {
1146 raidunlock(rs);
1147 return (EBUSY);
1148 }
1149
1150 retcode = rf_Shutdown(raidPtr);
1151
1152 /* It's no longer initialized... */
1153 rs->sc_flags &= ~RAIDF_INITED;
1154
1155 /* free the pseudo device attach bits */
1156
1157 cf = device_cfdata(rs->sc_dev);
1158 /* XXX this causes us to not return any errors
1159 from the above call to rf_Shutdown() */
1160 retcode = config_detach(rs->sc_dev, DETACH_QUIET);
1161 free(cf, M_RAIDFRAME);
1162
1163 /* Detach the disk. */
1164 pseudo_disk_detach(&rs->sc_dkdev);
1165
1166 raidunlock(rs);
1167
1168 return (retcode);
1169 case RAIDFRAME_GET_COMPONENT_LABEL:
1170 clabel_ptr = (RF_ComponentLabel_t **) data;
1171 /* need to read the component label for the disk indicated
1172 by row,column in clabel */
1173
		/* For practice, let's get it directly from disk, rather
		   than from the in-core copy */
1176 RF_Malloc( clabel, sizeof( RF_ComponentLabel_t ),
1177 (RF_ComponentLabel_t *));
1178 if (clabel == NULL)
1179 return (ENOMEM);
1180
1181 retcode = copyin( *clabel_ptr, clabel,
1182 sizeof(RF_ComponentLabel_t));
1183
1184 if (retcode) {
1185 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
1186 return(retcode);
1187 }
1188
1189 clabel->row = 0; /* Don't allow looking at anything else.*/
1190
1191 column = clabel->column;
1192
1193 if ((column < 0) || (column >= raidPtr->numCol +
1194 raidPtr->numSpare)) {
1195 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
1196 return(EINVAL);
1197 }
1198
1199 retcode = raidread_component_label(raidPtr->Disks[column].dev,
1200 raidPtr->raid_cinfo[column].ci_vp,
1201 clabel );
1202
1203 if (retcode == 0) {
1204 retcode = copyout(clabel, *clabel_ptr,
1205 sizeof(RF_ComponentLabel_t));
1206 }
1207 RF_Free(clabel, sizeof(RF_ComponentLabel_t));
1208 return (retcode);
1209
1210 case RAIDFRAME_SET_COMPONENT_LABEL:
1211 clabel = (RF_ComponentLabel_t *) data;
1212
1213 /* XXX check the label for valid stuff... */
1214 /* Note that some things *should not* get modified --
1215 the user should be re-initing the labels instead of
1216 trying to patch things.
1217 */
1218
1219 raidid = raidPtr->raidid;
1220 #ifdef DEBUG
1221 printf("raid%d: Got component label:\n", raidid);
1222 printf("raid%d: Version: %d\n", raidid, clabel->version);
1223 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1224 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1225 printf("raid%d: Column: %d\n", raidid, clabel->column);
1226 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1227 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1228 printf("raid%d: Status: %d\n", raidid, clabel->status);
1229 #endif
1230 clabel->row = 0;
1231 column = clabel->column;
1232
1233 if ((column < 0) || (column >= raidPtr->numCol)) {
1234 return(EINVAL);
1235 }
1236
1237 /* XXX this isn't allowed to do anything for now :-) */
1238
1239 /* XXX and before it is, we need to fill in the rest
1240 of the fields!?!?!?! */
1241 #if 0
1242 raidwrite_component_label(
1243 raidPtr->Disks[column].dev,
1244 raidPtr->raid_cinfo[column].ci_vp,
1245 clabel );
1246 #endif
1247 return (0);
1248
1249 case RAIDFRAME_INIT_LABELS:
1250 clabel = (RF_ComponentLabel_t *) data;
1251 /*
1252 we only want the serial number from
1253 the above. We get all the rest of the information
1254 from the config that was used to create this RAID
1255 set.
1256 */
1257
1258 raidPtr->serial_number = clabel->serial_number;
1259
1260 RF_Malloc(ci_label, sizeof(RF_ComponentLabel_t),
1261 (RF_ComponentLabel_t *));
1262 if (ci_label == NULL)
1263 return (ENOMEM);
1264
1265 raid_init_component_label(raidPtr, ci_label);
1266 ci_label->serial_number = clabel->serial_number;
1267 ci_label->row = 0; /* we dont' pretend to support more */
1268
1269 for(column=0;column<raidPtr->numCol;column++) {
1270 diskPtr = &raidPtr->Disks[column];
1271 if (!RF_DEAD_DISK(diskPtr->status)) {
1272 ci_label->partitionSize = diskPtr->partitionSize;
1273 ci_label->column = column;
1274 raidwrite_component_label(
1275 raidPtr->Disks[column].dev,
1276 raidPtr->raid_cinfo[column].ci_vp,
1277 ci_label );
1278 }
1279 }
1280 RF_Free(ci_label, sizeof(RF_ComponentLabel_t));
1281
1282 return (retcode);
1283 case RAIDFRAME_SET_AUTOCONFIG:
1284 d = rf_set_autoconfig(raidPtr, *(int *) data);
1285 printf("raid%d: New autoconfig value is: %d\n",
1286 raidPtr->raidid, d);
1287 *(int *) data = d;
1288 return (retcode);
1289
1290 case RAIDFRAME_SET_ROOT:
1291 d = rf_set_rootpartition(raidPtr, *(int *) data);
1292 printf("raid%d: New rootpartition value is: %d\n",
1293 raidPtr->raidid, d);
1294 *(int *) data = d;
1295 return (retcode);
1296
1297 /* initialize all parity */
1298 case RAIDFRAME_REWRITEPARITY:
1299
1300 if (raidPtr->Layout.map->faultsTolerated == 0) {
1301 /* Parity for RAID 0 is trivially correct */
1302 raidPtr->parity_good = RF_RAID_CLEAN;
1303 return(0);
1304 }
1305
1306 if (raidPtr->parity_rewrite_in_progress == 1) {
1307 /* Re-write is already in progress! */
1308 return(EINVAL);
1309 }
1310
1311 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1312 rf_RewriteParityThread,
1313 raidPtr,"raid_parity");
1314 return (retcode);
1315
1316
1317 case RAIDFRAME_ADD_HOT_SPARE:
1318 sparePtr = (RF_SingleComponent_t *) data;
1319 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1320 retcode = rf_add_hot_spare(raidPtr, &component);
1321 return(retcode);
1322
1323 case RAIDFRAME_REMOVE_HOT_SPARE:
1324 return(retcode);
1325
1326 case RAIDFRAME_DELETE_COMPONENT:
1327 componentPtr = (RF_SingleComponent_t *)data;
1328 memcpy( &component, componentPtr,
1329 sizeof(RF_SingleComponent_t));
1330 retcode = rf_delete_component(raidPtr, &component);
1331 return(retcode);
1332
1333 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1334 componentPtr = (RF_SingleComponent_t *)data;
1335 memcpy( &component, componentPtr,
1336 sizeof(RF_SingleComponent_t));
1337 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1338 return(retcode);
1339
1340 case RAIDFRAME_REBUILD_IN_PLACE:
1341
1342 if (raidPtr->Layout.map->faultsTolerated == 0) {
1343 /* Can't do this on a RAID 0!! */
1344 return(EINVAL);
1345 }
1346
1347 if (raidPtr->recon_in_progress == 1) {
1348 /* a reconstruct is already in progress! */
1349 return(EINVAL);
1350 }
1351
1352 componentPtr = (RF_SingleComponent_t *) data;
1353 memcpy( &component, componentPtr,
1354 sizeof(RF_SingleComponent_t));
1355 component.row = 0; /* we don't support any more */
1356 column = component.column;
1357
1358 if ((column < 0) || (column >= raidPtr->numCol)) {
1359 return(EINVAL);
1360 }
1361
1362 RF_LOCK_MUTEX(raidPtr->mutex);
1363 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1364 (raidPtr->numFailures > 0)) {
1365 /* XXX 0 above shouldn't be constant!!! */
1366 /* some component other than this has failed.
1367 Let's not make things worse than they already
1368 are... */
1369 printf("raid%d: Unable to reconstruct to disk at:\n",
1370 raidPtr->raidid);
1371 printf("raid%d: Col: %d Too many failures.\n",
1372 raidPtr->raidid, column);
1373 RF_UNLOCK_MUTEX(raidPtr->mutex);
1374 return (EINVAL);
1375 }
1376 if (raidPtr->Disks[column].status ==
1377 rf_ds_reconstructing) {
1378 printf("raid%d: Unable to reconstruct to disk at:\n",
1379 raidPtr->raidid);
1380 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column);
1381
1382 RF_UNLOCK_MUTEX(raidPtr->mutex);
1383 return (EINVAL);
1384 }
1385 if (raidPtr->Disks[column].status == rf_ds_spared) {
1386 RF_UNLOCK_MUTEX(raidPtr->mutex);
1387 return (EINVAL);
1388 }
1389 RF_UNLOCK_MUTEX(raidPtr->mutex);
1390
1391 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1392 if (rrcopy == NULL)
1393 return(ENOMEM);
1394
1395 rrcopy->raidPtr = (void *) raidPtr;
1396 rrcopy->col = column;
1397
1398 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1399 rf_ReconstructInPlaceThread,
1400 rrcopy,"raid_reconip");
1401 return(retcode);
1402
1403 case RAIDFRAME_GET_INFO:
1404 if (!raidPtr->valid)
1405 return (ENODEV);
1406 ucfgp = (RF_DeviceConfig_t **) data;
1407 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1408 (RF_DeviceConfig_t *));
1409 if (d_cfg == NULL)
1410 return (ENOMEM);
1411 d_cfg->rows = 1; /* there is only 1 row now */
1412 d_cfg->cols = raidPtr->numCol;
1413 d_cfg->ndevs = raidPtr->numCol;
1414 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1415 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1416 return (ENOMEM);
1417 }
1418 d_cfg->nspares = raidPtr->numSpare;
1419 if (d_cfg->nspares >= RF_MAX_DISKS) {
1420 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1421 return (ENOMEM);
1422 }
1423 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1424 d = 0;
1425 for (j = 0; j < d_cfg->cols; j++) {
1426 d_cfg->devs[d] = raidPtr->Disks[j];
1427 d++;
1428 }
1429 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1430 d_cfg->spares[i] = raidPtr->Disks[j];
1431 }
1432 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1433 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1434
1435 return (retcode);
1436
1437 case RAIDFRAME_CHECK_PARITY:
1438 *(int *) data = raidPtr->parity_good;
1439 return (0);
1440
1441 case RAIDFRAME_RESET_ACCTOTALS:
1442 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1443 return (0);
1444
1445 case RAIDFRAME_GET_ACCTOTALS:
1446 totals = (RF_AccTotals_t *) data;
1447 *totals = raidPtr->acc_totals;
1448 return (0);
1449
1450 case RAIDFRAME_KEEP_ACCTOTALS:
1451 raidPtr->keep_acc_totals = *(int *)data;
1452 return (0);
1453
1454 case RAIDFRAME_GET_SIZE:
1455 *(int *) data = raidPtr->totalSectors;
1456 return (0);
1457
1458 /* fail a disk & optionally start reconstruction */
1459 case RAIDFRAME_FAIL_DISK:
1460
1461 if (raidPtr->Layout.map->faultsTolerated == 0) {
1462 /* Can't do this on a RAID 0!! */
1463 return(EINVAL);
1464 }
1465
1466 rr = (struct rf_recon_req *) data;
1467 rr->row = 0;
1468 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1469 return (EINVAL);
1470
1471
1472 RF_LOCK_MUTEX(raidPtr->mutex);
1473 if (raidPtr->status == rf_rs_reconstructing) {
1474 /* you can't fail a disk while we're reconstructing! */
1475 /* XXX wrong for RAID6 */
1476 RF_UNLOCK_MUTEX(raidPtr->mutex);
1477 return (EINVAL);
1478 }
1479 if ((raidPtr->Disks[rr->col].status ==
1480 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1481 /* some other component has failed. Let's not make
1482 things worse. XXX wrong for RAID6 */
1483 RF_UNLOCK_MUTEX(raidPtr->mutex);
1484 return (EINVAL);
1485 }
1486 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1487 /* Can't fail a spared disk! */
1488 RF_UNLOCK_MUTEX(raidPtr->mutex);
1489 return (EINVAL);
1490 }
1491 RF_UNLOCK_MUTEX(raidPtr->mutex);
1492
1493 /* make a copy of the recon request so that we don't rely on
1494 * the user's buffer */
1495 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1496 if (rrcopy == NULL)
1497 return(ENOMEM);
1498 memcpy(rrcopy, rr, sizeof(*rr));
1499 rrcopy->raidPtr = (void *) raidPtr;
1500
1501 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1502 rf_ReconThread,
1503 rrcopy,"raid_recon");
1504 return (0);
1505
1506 /* invoke a copyback operation after recon on whatever disk
1507 * needs it, if any */
1508 case RAIDFRAME_COPYBACK:
1509
1510 if (raidPtr->Layout.map->faultsTolerated == 0) {
1511 /* This makes no sense on a RAID 0!! */
1512 return(EINVAL);
1513 }
1514
1515 if (raidPtr->copyback_in_progress == 1) {
1516 /* Copyback is already in progress! */
1517 return(EINVAL);
1518 }
1519
1520 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1521 rf_CopybackThread,
1522 raidPtr,"raid_copyback");
1523 return (retcode);
1524
1525 /* return the percentage completion of reconstruction */
1526 case RAIDFRAME_CHECK_RECON_STATUS:
1527 if (raidPtr->Layout.map->faultsTolerated == 0) {
1528 /* This makes no sense on a RAID 0, so tell the
1529 user it's done. */
1530 *(int *) data = 100;
1531 return(0);
1532 }
1533 if (raidPtr->status != rf_rs_reconstructing)
1534 *(int *) data = 100;
1535 else {
1536 if (raidPtr->reconControl->numRUsTotal > 0) {
1537 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1538 } else {
1539 *(int *) data = 0;
1540 }
1541 }
1542 return (0);
1543 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1544 progressInfoPtr = (RF_ProgressInfo_t **) data;
1545 if (raidPtr->status != rf_rs_reconstructing) {
1546 progressInfo.remaining = 0;
1547 progressInfo.completed = 100;
1548 progressInfo.total = 100;
1549 } else {
1550 progressInfo.total =
1551 raidPtr->reconControl->numRUsTotal;
1552 progressInfo.completed =
1553 raidPtr->reconControl->numRUsComplete;
1554 progressInfo.remaining = progressInfo.total -
1555 progressInfo.completed;
1556 }
1557 retcode = copyout(&progressInfo, *progressInfoPtr,
1558 sizeof(RF_ProgressInfo_t));
1559 return (retcode);
1560
1561 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1562 if (raidPtr->Layout.map->faultsTolerated == 0) {
1563 /* This makes no sense on a RAID 0, so tell the
1564 user it's done. */
1565 *(int *) data = 100;
1566 return(0);
1567 }
1568 if (raidPtr->parity_rewrite_in_progress == 1) {
1569 *(int *) data = 100 *
1570 raidPtr->parity_rewrite_stripes_done /
1571 raidPtr->Layout.numStripe;
1572 } else {
1573 *(int *) data = 100;
1574 }
1575 return (0);
1576
1577 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1578 progressInfoPtr = (RF_ProgressInfo_t **) data;
1579 if (raidPtr->parity_rewrite_in_progress == 1) {
1580 progressInfo.total = raidPtr->Layout.numStripe;
1581 progressInfo.completed =
1582 raidPtr->parity_rewrite_stripes_done;
1583 progressInfo.remaining = progressInfo.total -
1584 progressInfo.completed;
1585 } else {
1586 progressInfo.remaining = 0;
1587 progressInfo.completed = 100;
1588 progressInfo.total = 100;
1589 }
1590 retcode = copyout(&progressInfo, *progressInfoPtr,
1591 sizeof(RF_ProgressInfo_t));
1592 return (retcode);
1593
1594 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1595 if (raidPtr->Layout.map->faultsTolerated == 0) {
1596 /* This makes no sense on a RAID 0 */
1597 *(int *) data = 100;
1598 return(0);
1599 }
1600 if (raidPtr->copyback_in_progress == 1) {
1601 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1602 raidPtr->Layout.numStripe;
1603 } else {
1604 *(int *) data = 100;
1605 }
1606 return (0);
1607
1608 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1609 progressInfoPtr = (RF_ProgressInfo_t **) data;
1610 if (raidPtr->copyback_in_progress == 1) {
1611 progressInfo.total = raidPtr->Layout.numStripe;
1612 progressInfo.completed =
1613 raidPtr->copyback_stripes_done;
1614 progressInfo.remaining = progressInfo.total -
1615 progressInfo.completed;
1616 } else {
1617 progressInfo.remaining = 0;
1618 progressInfo.completed = 100;
1619 progressInfo.total = 100;
1620 }
1621 retcode = copyout(&progressInfo, *progressInfoPtr,
1622 sizeof(RF_ProgressInfo_t));
1623 return (retcode);
1624
1625 /* the sparetable daemon calls this to wait for the kernel to
1626 * need a spare table. this ioctl does not return until a
1627 * spare table is needed. XXX -- calling mpsleep here in the
1628 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1629 * -- I should either compute the spare table in the kernel,
1630 * or have a different -- XXX XXX -- interface (a different
1631 * character device) for delivering the table -- XXX */
1632 #if 0
1633 case RAIDFRAME_SPARET_WAIT:
1634 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1635 while (!rf_sparet_wait_queue)
1636 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1637 waitreq = rf_sparet_wait_queue;
1638 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1639 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1640
1641 /* structure assignment */
1642 *((RF_SparetWait_t *) data) = *waitreq;
1643
1644 RF_Free(waitreq, sizeof(*waitreq));
1645 return (0);
1646
1647 /* wakes up a process waiting on SPARET_WAIT and puts an error
	 * code in it that will cause the daemon to exit */
1649 case RAIDFRAME_ABORT_SPARET_WAIT:
1650 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1651 waitreq->fcol = -1;
1652 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1653 waitreq->next = rf_sparet_wait_queue;
1654 rf_sparet_wait_queue = waitreq;
1655 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1656 wakeup(&rf_sparet_wait_queue);
1657 return (0);
1658
1659 /* used by the spare table daemon to deliver a spare table
1660 * into the kernel */
1661 case RAIDFRAME_SEND_SPARET:
1662
1663 /* install the spare table */
1664 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1665
1666 /* respond to the requestor. the return status of the spare
1667 * table installation is passed in the "fcol" field */
1668 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1669 waitreq->fcol = retcode;
1670 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1671 waitreq->next = rf_sparet_resp_queue;
1672 rf_sparet_resp_queue = waitreq;
1673 wakeup(&rf_sparet_resp_queue);
1674 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1675
1676 return (retcode);
1677 #endif
1678
1679 default:
1680 break; /* fall through to the os-specific code below */
1681
1682 }
1683
1684 if (!raidPtr->valid)
1685 return (EINVAL);
1686
1687 /*
1688 * Add support for "regular" device ioctls here.
1689 */
1690
1691 switch (cmd) {
1692 case DIOCGDINFO:
1693 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1694 break;
1695 #ifdef __HAVE_OLD_DISKLABEL
1696 case ODIOCGDINFO:
1697 newlabel = *(rs->sc_dkdev.dk_label);
1698 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1699 return ENOTTY;
1700 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1701 break;
1702 #endif
1703
1704 case DIOCGPART:
1705 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1706 ((struct partinfo *) data)->part =
1707 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1708 break;
1709
1710 case DIOCWDINFO:
1711 case DIOCSDINFO:
1712 #ifdef __HAVE_OLD_DISKLABEL
1713 case ODIOCWDINFO:
1714 case ODIOCSDINFO:
1715 #endif
1716 {
1717 struct disklabel *lp;
1718 #ifdef __HAVE_OLD_DISKLABEL
1719 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1720 memset(&newlabel, 0, sizeof newlabel);
1721 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1722 lp = &newlabel;
1723 } else
1724 #endif
1725 lp = (struct disklabel *)data;
1726
1727 if ((error = raidlock(rs)) != 0)
1728 return (error);
1729
1730 rs->sc_flags |= RAIDF_LABELLING;
1731
1732 error = setdisklabel(rs->sc_dkdev.dk_label,
1733 lp, 0, rs->sc_dkdev.dk_cpulabel);
1734 if (error == 0) {
1735 if (cmd == DIOCWDINFO
1736 #ifdef __HAVE_OLD_DISKLABEL
1737 || cmd == ODIOCWDINFO
1738 #endif
1739 )
1740 error = writedisklabel(RAIDLABELDEV(dev),
1741 raidstrategy, rs->sc_dkdev.dk_label,
1742 rs->sc_dkdev.dk_cpulabel);
1743 }
1744 rs->sc_flags &= ~RAIDF_LABELLING;
1745
1746 raidunlock(rs);
1747
1748 if (error)
1749 return (error);
1750 break;
1751 }
1752
1753 case DIOCWLABEL:
1754 if (*(int *) data != 0)
1755 rs->sc_flags |= RAIDF_WLABEL;
1756 else
1757 rs->sc_flags &= ~RAIDF_WLABEL;
1758 break;
1759
1760 case DIOCGDEFLABEL:
1761 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1762 break;
1763
1764 #ifdef __HAVE_OLD_DISKLABEL
1765 case ODIOCGDEFLABEL:
1766 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1767 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1768 return ENOTTY;
1769 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1770 break;
1771 #endif
1772
1773 case DIOCAWEDGE:
1774 case DIOCDWEDGE:
1775 dkw = (void *)data;
1776
1777 /* If the ioctl happens here, the parent is us. */
1778 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1779 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1780
1781 case DIOCLWEDGES:
1782 return dkwedge_list(&rs->sc_dkdev,
1783 (struct dkwedge_list *)data, l);
1784
1785 default:
1786 retcode = ENOTTY;
1787 }
1788 return (retcode);
1789
1790 }
1791
1792
1793 /* raidinit -- complete the rest of the initialization for the
1794 RAIDframe device. */
1795
1796
1797 static void
1798 raidinit(RF_Raid_t *raidPtr)
1799 {
1800 struct cfdata *cf;
1801 struct raid_softc *rs;
1802 int unit;
1803
1804 unit = raidPtr->raidid;
1805
1806 rs = &raid_softc[unit];
1807
1808 /* XXX should check return code first... */
1809 rs->sc_flags |= RAIDF_INITED;
1810
1811 /* XXX doesn't check bounds. */
1812 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1813
1814 rs->sc_dkdev.dk_name = rs->sc_xname;
1815
1816 /* attach the pseudo device */
1817 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1818 cf->cf_name = raid_cd.cd_name;
1819 cf->cf_atname = raid_cd.cd_name;
1820 cf->cf_unit = unit;
1821 cf->cf_fstate = FSTATE_STAR;
1822
1823 rs->sc_dev = config_attach_pseudo(cf);
1824
1825 if (rs->sc_dev==NULL) {
1826 printf("raid%d: config_attach_pseudo failed\n",
1827 raidPtr->raidid);
1828 }
1829
1830 /* disk_attach actually creates space for the CPU disklabel, among
1831 * other things, so it's critical to call this *BEFORE* we try putzing
1832 * with disklabels. */
1833
1834 disk_attach(&rs->sc_dkdev);
1835
1836 /* XXX There may be a weird interaction here between this, and
1837 * protectedSectors, as used in RAIDframe. */
1838
1839 rs->sc_size = raidPtr->totalSectors;
1840 }
1841 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1842 /* wake up the daemon & tell it to get us a spare table
1843 * XXX
1844 * the entries in the queues should be tagged with the raidPtr
1845 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
1847 * XXX
1848 *
1849 * XXX This code is not currently used. GO
1850 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	/* Post the request on the wait queue and wake the daemon. */
	RF_LOCK_MUTEX(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	wakeup(&rf_sparet_wait_queue);

	/* mpsleep unlocks the mutex */
	/*
	 * NOTE(review): the comment above looks stale -- the code now
	 * uses tsleep(), and tsleep() as called here does not release
	 * rf_sparet_wait_mutex.  Confirm the locking before re-enabling
	 * this (currently unused -- see the XXX above) path.
	 */
	while (!rf_sparet_resp_queue) {
		tsleep(&rf_sparet_resp_queue, PRIBIO,
		       "raidframe getsparetable", 0);
	}

	/* Dequeue the daemon's response; its status is in `fcol'. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1875 #endif
1876
1877 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1878 * bp & passes it down.
1879 * any calls originating in the kernel must use non-blocking I/O
1880 * do some extra sanity checking to return "appropriate" error values for
1881 * certain conditions (to make some standard utilities work)
1882 *
1883 * Formerly known as: rf_DoAccessKernel
1884 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	int unit;
	struct raid_softc *rs;
	int do_async;
	struct buf *bp;
	int rc;

	unit = raidPtr->raidid;
	rs = &raid_softc[unit];

	/* quick check to see if anything has died recently */
	RF_LOCK_MUTEX(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/*
		 * Drop the mutex across the label rewrite;
		 * rf_update_component_labels() does its own work
		 * on the components.
		 */
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	while (raidPtr->openings > 0) {
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = BUFQ_GET(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb is 1 if the request ends with a partial sector */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/*
		 * Fail requests that run past the end of the RAID set;
		 * the (sum < ...) comparisons also catch arithmetic
		 * wraparound in the sum above.
		 */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_flags |= B_ERROR;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* reject transfers that are not a multiple of the sector size */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_flags |= B_ERROR;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* charge this request against the unit's open slots */
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->openings--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/*
			 * The access was rejected up front -- fail the buf
			 * here.  NOTE(review): openings was decremented
			 * above and is not restored on this path;
			 * presumably the completion machinery accounts for
			 * it -- verify.
			 */
			bp->b_error = rc;
			bp->b_flags |= B_ERROR;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		RF_LOCK_MUTEX(raidPtr->mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->mutex);
}
2008
2009
2010
2011
/*
 * rf_DispatchKernelIO -- invoke an I/O from kernel mode.
 *
 * A NOP request is completed immediately through KernelWakeupFunc();
 * a read or write is set up in req->bp via InitBP() and handed to the
 * component's driver with VOP_STRATEGY().  The disk queue should be
 * locked upon entry.  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;

#if DIAGNOSTIC
	if (queue->raidPtr->raidid >= numraid) {
		printf("Invalid unit number: %d %d\n", queue->raidPtr->raidid,
		       numraid);
		panic("Invalid Unit number in rf_DispatchKernelIO");
	}
#endif

	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete the NOP right away via the normal callback */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf for the I/O on the component */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		       op, queue->rf_cinfo->ci_dev,
		       req->sectorOffset, req->numSector,
		       req->buf, KernelWakeupFunc, (void *) req,
		       queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				    (long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			    (int) req->sectorOffset, (int) req->numSector,
			    (int) (req->numSector <<
				   queue->raidPtr->logBytesPerSector),
			    (int) queue->raidPtr->logBytesPerSector));
		/* hand the buf to the component's driver */
		VOP_STRATEGY(bp->b_vp, bp);

		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/*
 * KernelWakeupFunc -- callback function associated with an I/O invoked
 * from kernel code (set up by InitBP()/rf_DispatchKernelIO()).
 *
 * Runs with interrupts blocked via splbio().  Collects timing stats,
 * marks the component failed on I/O error (but only once, and only if
 * that would not exceed the set's fault tolerance), records the error
 * status in the request, and queues the request on the raidPtr's
 * iodone list for the raidio thread to finish.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;
	int s;

	s = splbio();
	db1_printf(("recovering the request queue:\n"));
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		/* account the physical I/O time against this access */
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		RF_LOCK_MUTEX(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		RF_UNLOCK_MUTEX(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If B_ERROR is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_flags & B_ERROR) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		    (queue->raidPtr->numFailures <
		     queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error. Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */

	req->error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;

	simple_lock(&queue->raidPtr->iodone_lock);

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	wakeup(&(queue->raidPtr->iodone));

	simple_unlock(&queue->raidPtr->iodone_lock);

	splx(s);
}
2160
2161
2162
2163 /*
2164 * initialize a buf structure for doing an I/O in the kernel.
2165 */
2166 static void
2167 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2168 RF_SectorNum_t startSect, RF_SectorCount_t numSect, caddr_t bf,
2169 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2170 struct proc *b_proc)
2171 {
2172 /* bp->b_flags = B_PHYS | rw_flag; */
2173 bp->b_flags = B_CALL | rw_flag; /* XXX need B_PHYS here too??? */
2174 bp->b_bcount = numSect << logBytesPerSector;
2175 bp->b_bufsize = bp->b_bcount;
2176 bp->b_error = 0;
2177 bp->b_dev = dev;
2178 bp->b_data = bf;
2179 bp->b_blkno = startSect;
2180 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2181 if (bp->b_bcount == 0) {
2182 panic("bp->b_bcount is zero in InitBP!!");
2183 }
2184 bp->b_proc = b_proc;
2185 bp->b_iodone = cbFunc;
2186 bp->b_private = cbArg;
2187 bp->b_vp = b_vp;
2188 if ((bp->b_flags & B_READ) == 0) {
2189 bp->b_vp->v_numoutput++;
2190 }
2191
2192 }
2193
2194 static void
2195 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2196 struct disklabel *lp)
2197 {
2198 memset(lp, 0, sizeof(*lp));
2199
2200 /* fabricate a label... */
2201 lp->d_secperunit = raidPtr->totalSectors;
2202 lp->d_secsize = raidPtr->bytesPerSector;
2203 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2204 lp->d_ntracks = 4 * raidPtr->numCol;
2205 lp->d_ncylinders = raidPtr->totalSectors /
2206 (lp->d_nsectors * lp->d_ntracks);
2207 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2208
2209 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2210 lp->d_type = DTYPE_RAID;
2211 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2212 lp->d_rpm = 3600;
2213 lp->d_interleave = 1;
2214 lp->d_flags = 0;
2215
2216 lp->d_partitions[RAW_PART].p_offset = 0;
2217 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2218 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2219 lp->d_npartitions = RAW_PART + 1;
2220
2221 lp->d_magic = DISKMAGIC;
2222 lp->d_magic2 = DISKMAGIC;
2223 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2224
2225 }
2226 /*
2227 * Read the disklabel from the raid device. If one is not present, fake one
2228 * up.
2229 */
/*
 * raidgetdisklabel: load the disklabel for the raid device.  A default
 * label is fabricated first; if readdisklabel() finds a real one it
 * replaces the default and is then sanity-checked against the current
 * size of the raid, otherwise raidmakedisklabel() patches the default.
 */
static void
raidgetdisklabel(dev_t dev)
{
	int unit = raidunit(dev);
	struct raid_softc *rs = &raid_softc[unit];
	const char *errstring;
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
	RF_Raid_t *raidPtr;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = raidPtrs[unit];

	/* start from a fabricated label so all fields are sane even if
	   the on-disk read below fails */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%d) != "
			    "the size of raid (%ld)\n", unit, rs->sc_xname,
			    lp->d_secperunit, (long) rs->sc_size);
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				    "exceeds the size of raid (%ld)\n",
				    unit, rs->sc_xname, 'a' + i, (long) rs->sc_size);
		}
	}

}
2282 /*
2283 * Take care of things one might want to take care of in the event
2284 * that a disklabel isn't present.
2285 */
2286 static void
2287 raidmakedisklabel(struct raid_softc *rs)
2288 {
2289 struct disklabel *lp = rs->sc_dkdev.dk_label;
2290 db1_printf(("Making a label..\n"));
2291
2292 /*
2293 * For historical reasons, if there's no disklabel present
2294 * the raw partition must be marked FS_BSDFFS.
2295 */
2296
2297 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2298
2299 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2300
2301 lp->d_checksum = dkcksum(lp);
2302 }
2303 /*
2304 * Wait interruptibly for an exclusive lock.
2305 *
2306 * XXX
2307 * Several drivers do this; it should be abstracted and made MP-safe.
2308 * (Hmm... where have we seen this warning before :-> GO )
2309 */
2310 static int
2311 raidlock(struct raid_softc *rs)
2312 {
2313 int error;
2314
2315 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2316 rs->sc_flags |= RAIDF_WANTED;
2317 if ((error =
2318 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2319 return (error);
2320 }
2321 rs->sc_flags |= RAIDF_LOCKED;
2322 return (0);
2323 }
2324 /*
2325 * Unlock and wake up any waiters.
2326 */
2327 static void
2328 raidunlock(struct raid_softc *rs)
2329 {
2330
2331 rs->sc_flags &= ~RAIDF_LOCKED;
2332 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2333 rs->sc_flags &= ~RAIDF_WANTED;
2334 wakeup(rs);
2335 }
2336 }
2337
2338
2339 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2340 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2341
2342 int
2343 raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter)
2344 {
2345 RF_ComponentLabel_t clabel;
2346 raidread_component_label(dev, b_vp, &clabel);
2347 clabel.mod_counter = mod_counter;
2348 clabel.clean = RF_RAID_CLEAN;
2349 raidwrite_component_label(dev, b_vp, &clabel);
2350 return(0);
2351 }
2352
2353
2354 int
2355 raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter)
2356 {
2357 RF_ComponentLabel_t clabel;
2358 raidread_component_label(dev, b_vp, &clabel);
2359 clabel.mod_counter = mod_counter;
2360 clabel.clean = RF_RAID_DIRTY;
2361 raidwrite_component_label(dev, b_vp, &clabel);
2362 return(0);
2363 }
2364
2365 /* ARGSUSED */
2366 int
2367 raidread_component_label(dev_t dev, struct vnode *b_vp,
2368 RF_ComponentLabel_t *clabel)
2369 {
2370 struct buf *bp;
2371 const struct bdevsw *bdev;
2372 int error;
2373
2374 /* XXX should probably ensure that we don't try to do this if
2375 someone has changed rf_protected_sectors. */
2376
2377 if (b_vp == NULL) {
2378 /* For whatever reason, this component is not valid.
2379 Don't try to read a component label from it. */
2380 return(EINVAL);
2381 }
2382
2383 /* get a block of the appropriate size... */
2384 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2385 bp->b_dev = dev;
2386
2387 /* get our ducks in a row for the read */
2388 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2389 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2390 bp->b_flags |= B_READ;
2391 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2392
2393 bdev = bdevsw_lookup(bp->b_dev);
2394 if (bdev == NULL)
2395 return (ENXIO);
2396 (*bdev->d_strategy)(bp);
2397
2398 error = biowait(bp);
2399
2400 if (!error) {
2401 memcpy(clabel, bp->b_data,
2402 sizeof(RF_ComponentLabel_t));
2403 }
2404
2405 brelse(bp);
2406 return(error);
2407 }
2408 /* ARGSUSED */
2409 int
2410 raidwrite_component_label(dev_t dev, struct vnode *b_vp,
2411 RF_ComponentLabel_t *clabel)
2412 {
2413 struct buf *bp;
2414 const struct bdevsw *bdev;
2415 int error;
2416
2417 /* get a block of the appropriate size... */
2418 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2419 bp->b_dev = dev;
2420
2421 /* get our ducks in a row for the write */
2422 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2423 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2424 bp->b_flags |= B_WRITE;
2425 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2426
2427 memset(bp->b_data, 0, RF_COMPONENT_INFO_SIZE );
2428
2429 memcpy(bp->b_data, clabel, sizeof(RF_ComponentLabel_t));
2430
2431 bdev = bdevsw_lookup(bp->b_dev);
2432 if (bdev == NULL)
2433 return (ENXIO);
2434 (*bdev->d_strategy)(bp);
2435 error = biowait(bp);
2436 brelse(bp);
2437 if (error) {
2438 #if 1
2439 printf("Failed to write RAID component info!\n");
2440 #endif
2441 }
2442
2443 return(error);
2444 }
2445
/*
 * rf_markalldirty: bump the set's mod_counter and mark the component
 * label of every live component (and every in-use spare) as dirty.
 * Done when the set goes into use so that an unclean shutdown can be
 * detected at the next configuration.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			raidread_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			if (clabel.status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(
				    raidPtr->Disks[c].dev,
				    raidPtr->raid_cinfo[c].ci_vp,
				    raidPtr->mod_counter);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which array column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			raidread_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, &clabel);

			clabel.row = 0;
			clabel.column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr->Disks[sparecol].dev,
				      raidPtr->raid_cinfo[sparecol].ci_vp,
				      raidPtr->mod_counter);
		}
	}
}
2516
2517
/*
 * rf_update_component_labels: bump the mod_counter and rewrite the
 * component label on every optimal component and every in-use spare.
 * When 'final' is RF_FINAL_COMPONENT_UPDATE and parity is known good,
 * the components are additionally marked clean (shutdown path).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			raidread_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			/* make sure status is noted */
			clabel.status = rf_ds_optimal;

			/* bump the counter */
			clabel.mod_counter = raidPtr->mod_counter;

			/* note what unit we are configured as */
			clabel.last_unit = raidPtr->raidid;

			raidwrite_component_label(
				raidPtr->Disks[c].dev,
				raidPtr->raid_cinfo[c].ci_vp,
				&clabel);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(
						raidPtr->Disks[c].dev,
						raidPtr->raid_cinfo[c].ci_vp,
						raidPtr->mod_counter);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which array column this spare replaced */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			raidread_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, &clabel);

			clabel.mod_counter = raidPtr->mod_counter;
			clabel.column = scol;
			clabel.status = rf_ds_optimal;
			clabel.last_unit = raidPtr->raidid;

			raidwrite_component_label(
				raidPtr->Disks[sparecol].dev,
				raidPtr->raid_cinfo[sparecol].ci_vp,
				&clabel);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean( raidPtr->Disks[sparecol].dev,
						       raidPtr->raid_cinfo[sparecol].ci_vp,
						       raidPtr->mod_counter);
				}
			}
		}
	}
}
2613
2614 void
2615 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2616 {
2617 struct proc *p;
2618 struct lwp *l;
2619
2620 p = raidPtr->engine_thread;
2621 l = LIST_FIRST(&p->p_lwps);
2622
2623 if (vp != NULL) {
2624 if (auto_configured == 1) {
2625 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2626 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED, 0);
2627 vput(vp);
2628
2629 } else {
2630 (void) vn_close(vp, FREAD | FWRITE, p->p_cred, l);
2631 }
2632 }
2633 }
2634
2635
2636 void
2637 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2638 {
2639 int r,c;
2640 struct vnode *vp;
2641 int acd;
2642
2643
2644 /* We take this opportunity to close the vnodes like we should.. */
2645
2646 for (c = 0; c < raidPtr->numCol; c++) {
2647 vp = raidPtr->raid_cinfo[c].ci_vp;
2648 acd = raidPtr->Disks[c].auto_configured;
2649 rf_close_component(raidPtr, vp, acd);
2650 raidPtr->raid_cinfo[c].ci_vp = NULL;
2651 raidPtr->Disks[c].auto_configured = 0;
2652 }
2653
2654 for (r = 0; r < raidPtr->numSpare; r++) {
2655 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2656 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2657 rf_close_component(raidPtr, vp, acd);
2658 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2659 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2660 }
2661 }
2662
2663
2664 void
2665 rf_ReconThread(struct rf_recon_req *req)
2666 {
2667 int s;
2668 RF_Raid_t *raidPtr;
2669
2670 s = splbio();
2671 raidPtr = (RF_Raid_t *) req->raidPtr;
2672 raidPtr->recon_in_progress = 1;
2673
2674 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2675 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2676
2677 RF_Free(req, sizeof(*req));
2678
2679 raidPtr->recon_in_progress = 0;
2680 splx(s);
2681
2682 /* That's all... */
2683 kthread_exit(0); /* does not return */
2684 }
2685
2686 void
2687 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2688 {
2689 int retcode;
2690 int s;
2691
2692 raidPtr->parity_rewrite_stripes_done = 0;
2693 raidPtr->parity_rewrite_in_progress = 1;
2694 s = splbio();
2695 retcode = rf_RewriteParity(raidPtr);
2696 splx(s);
2697 if (retcode) {
2698 printf("raid%d: Error re-writing parity!\n",raidPtr->raidid);
2699 } else {
2700 /* set the clean bit! If we shutdown correctly,
2701 the clean bit on each component label will get
2702 set */
2703 raidPtr->parity_good = RF_RAID_CLEAN;
2704 }
2705 raidPtr->parity_rewrite_in_progress = 0;
2706
2707 /* Anyone waiting for us to stop? If so, inform them... */
2708 if (raidPtr->waitShutdown) {
2709 wakeup(&raidPtr->parity_rewrite_in_progress);
2710 }
2711
2712 /* That's all... */
2713 kthread_exit(0); /* does not return */
2714 }
2715
2716
2717 void
2718 rf_CopybackThread(RF_Raid_t *raidPtr)
2719 {
2720 int s;
2721
2722 raidPtr->copyback_in_progress = 1;
2723 s = splbio();
2724 rf_CopybackReconstructedData(raidPtr);
2725 splx(s);
2726 raidPtr->copyback_in_progress = 0;
2727
2728 /* That's all... */
2729 kthread_exit(0); /* does not return */
2730 }
2731
2732
2733 void
2734 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2735 {
2736 int s;
2737 RF_Raid_t *raidPtr;
2738
2739 s = splbio();
2740 raidPtr = req->raidPtr;
2741 raidPtr->recon_in_progress = 1;
2742 rf_ReconstructInPlace(raidPtr, req->col);
2743 RF_Free(req, sizeof(*req));
2744 raidPtr->recon_in_progress = 0;
2745 splx(s);
2746
2747 /* That's all... */
2748 kthread_exit(0); /* does not return */
2749 }
2750
/*
 * rf_get_component: probe one candidate component.  Read its component
 * label; if the label is present, reasonable, and fits within the
 * partition, allocate an RF_AutoConfig_t and prepend it to ac_list
 * (taking ownership of vp).  Otherwise close and release the vnode.
 *
 * On allocation failure the entire ac_list is torn down and NULL is
 * returned; otherwise the (possibly extended) list is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* out of memory: free everything collected so far */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel) &&
		    (clabel->partitionSize <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			       cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: we keep neither the label nor the vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED, 0);
		vput(vp);
	}
	return ac_list;
}
2807
2808 RF_AutoConfig_t *
2809 rf_find_raid_components()
2810 {
2811 struct vnode *vp;
2812 struct disklabel label;
2813 struct device *dv;
2814 dev_t dev;
2815 int bmajor, bminor, wedge;
2816 int error;
2817 int i;
2818 RF_AutoConfig_t *ac_list;
2819
2820
2821 /* initialize the AutoConfig list */
2822 ac_list = NULL;
2823
2824 /* we begin by trolling through *all* the devices on the system */
2825
2826 for (dv = alldevs.tqh_first; dv != NULL;
2827 dv = dv->dv_list.tqe_next) {
2828
2829 /* we are only interested in disks... */
2830 if (device_class(dv) != DV_DISK)
2831 continue;
2832
2833 /* we don't care about floppies... */
2834 if (device_is_a(dv, "fd")) {
2835 continue;
2836 }
2837
2838 /* we don't care about CD's... */
2839 if (device_is_a(dv, "cd")) {
2840 continue;
2841 }
2842
2843 /* hdfd is the Atari/Hades floppy driver */
2844 if (device_is_a(dv, "hdfd")) {
2845 continue;
2846 }
2847
2848 /* fdisa is the Atari/Milan floppy driver */
2849 if (device_is_a(dv, "fdisa")) {
2850 continue;
2851 }
2852
2853 /* need to find the device_name_to_block_device_major stuff */
2854 bmajor = devsw_name2blk(dv->dv_xname, NULL, 0);
2855
2856 /* get a vnode for the raw partition of this disk */
2857
2858 wedge = device_is_a(dv, "dk");
2859 bminor = minor(device_unit(dv));
2860 dev = wedge ? makedev(bmajor, bminor) :
2861 MAKEDISKDEV(bmajor, bminor, RAW_PART);
2862 if (bdevvp(dev, &vp))
2863 panic("RAID can't alloc vnode");
2864
2865 error = VOP_OPEN(vp, FREAD, NOCRED, 0);
2866
2867 if (error) {
2868 /* "Who cares." Continue looking
2869 for something that exists*/
2870 vput(vp);
2871 continue;
2872 }
2873
2874 if (wedge) {
2875 struct dkwedge_info dkw;
2876 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
2877 NOCRED, 0);
2878 if (error) {
2879 printf("RAIDframe: can't get wedge info for "
2880 "dev %s (%d)\n", dv->dv_xname, error);
2881 out:
2882 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2883 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED, 0);
2884 vput(vp);
2885 continue;
2886 }
2887
2888 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0)
2889 goto out;
2890
2891 ac_list = rf_get_component(ac_list, dev, vp,
2892 dv->dv_xname, dkw.dkw_size);
2893 continue;
2894 }
2895
2896 /* Ok, the disk exists. Go get the disklabel. */
2897 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED, 0);
2898 if (error) {
2899 /*
2900 * XXX can't happen - open() would
2901 * have errored out (or faked up one)
2902 */
2903 if (error != ENOTTY)
2904 printf("RAIDframe: can't get label for dev "
2905 "%s (%d)\n", dv->dv_xname, error);
2906 }
2907
2908 /* don't need this any more. We'll allocate it again
2909 a little later if we really do... */
2910 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2911 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED, 0);
2912 vput(vp);
2913
2914 if (error)
2915 continue;
2916
2917 for (i = 0; i < label.d_npartitions; i++) {
2918 char cname[sizeof(ac_list->devname)];
2919
2920 /* We only support partitions marked as RAID */
2921 if (label.d_partitions[i].p_fstype != FS_RAID)
2922 continue;
2923
2924 dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
2925 if (bdevvp(dev, &vp))
2926 panic("RAID can't alloc vnode");
2927
2928 error = VOP_OPEN(vp, FREAD, NOCRED, 0);
2929 if (error) {
2930 /* Whatever... */
2931 vput(vp);
2932 continue;
2933 }
2934 snprintf(cname, sizeof(cname), "%s%c",
2935 dv->dv_xname, 'a' + i);
2936 ac_list = rf_get_component(ac_list, dev, vp, cname,
2937 label.d_partitions[i].p_size);
2938 }
2939 }
2940 return ac_list;
2941 }
2942
2943
2944 static int
2945 rf_reasonable_label(RF_ComponentLabel_t *clabel)
2946 {
2947
2948 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2949 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2950 ((clabel->clean == RF_RAID_CLEAN) ||
2951 (clabel->clean == RF_RAID_DIRTY)) &&
2952 clabel->row >=0 &&
2953 clabel->column >= 0 &&
2954 clabel->num_rows > 0 &&
2955 clabel->num_columns > 0 &&
2956 clabel->row < clabel->num_rows &&
2957 clabel->column < clabel->num_columns &&
2958 clabel->blockSize > 0 &&
2959 clabel->numBlocks > 0) {
2960 /* label looks reasonable enough... */
2961 return(1);
2962 }
2963 return(0);
2964 }
2965
2966
2967 #ifdef DEBUG
/*
 * rf_print_component_label: dump the interesting fields of a component
 * label to the console.  Debug aid only (compiled under #ifdef DEBUG).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status );
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %d\n",
	       (char) clabel->parityConfig, clabel->blockSize,
	       clabel->numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No" );
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No" );
	printf("   Last configured as: raid%d\n", clabel->last_unit );
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
2993 #endif
2994
2995 RF_ConfigSet_t *
2996 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
2997 {
2998 RF_AutoConfig_t *ac;
2999 RF_ConfigSet_t *config_sets;
3000 RF_ConfigSet_t *cset;
3001 RF_AutoConfig_t *ac_next;
3002
3003
3004 config_sets = NULL;
3005
3006 /* Go through the AutoConfig list, and figure out which components
3007 belong to what sets. */
3008 ac = ac_list;
3009 while(ac!=NULL) {
3010 /* we're going to putz with ac->next, so save it here
3011 for use at the end of the loop */
3012 ac_next = ac->next;
3013
3014 if (config_sets == NULL) {
3015 /* will need at least this one... */
3016 config_sets = (RF_ConfigSet_t *)
3017 malloc(sizeof(RF_ConfigSet_t),
3018 M_RAIDFRAME, M_NOWAIT);
3019 if (config_sets == NULL) {
3020 panic("rf_create_auto_sets: No memory!");
3021 }
3022 /* this one is easy :) */
3023 config_sets->ac = ac;
3024 config_sets->next = NULL;
3025 config_sets->rootable = 0;
3026 ac->next = NULL;
3027 } else {
3028 /* which set does this component fit into? */
3029 cset = config_sets;
3030 while(cset!=NULL) {
3031 if (rf_does_it_fit(cset, ac)) {
3032 /* looks like it matches... */
3033 ac->next = cset->ac;
3034 cset->ac = ac;
3035 break;
3036 }
3037 cset = cset->next;
3038 }
3039 if (cset==NULL) {
3040 /* didn't find a match above... new set..*/
3041 cset = (RF_ConfigSet_t *)
3042 malloc(sizeof(RF_ConfigSet_t),
3043 M_RAIDFRAME, M_NOWAIT);
3044 if (cset == NULL) {
3045 panic("rf_create_auto_sets: No memory!");
3046 }
3047 cset->ac = ac;
3048 ac->next = NULL;
3049 cset->next = config_sets;
3050 cset->rootable = 0;
3051 config_sets = cset;
3052 }
3053 }
3054 ac = ac_next;
3055 }
3056
3057
3058 return(config_sets);
3059 }
3060
3061 static int
3062 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3063 {
3064 RF_ComponentLabel_t *clabel1, *clabel2;
3065
3066 /* If this one matches the *first* one in the set, that's good
3067 enough, since the other members of the set would have been
3068 through here too... */
3069 /* note that we are not checking partitionSize here..
3070
3071 Note that we are also not checking the mod_counters here.
3072 If everything else matches execpt the mod_counter, that's
3073 good enough for this test. We will deal with the mod_counters
3074 a little later in the autoconfiguration process.
3075
3076 (clabel1->mod_counter == clabel2->mod_counter) &&
3077
3078 The reason we don't check for this is that failed disks
3079 will have lower modification counts. If those disks are
3080 not added to the set they used to belong to, then they will
3081 form their own set, which may result in 2 different sets,
3082 for example, competing to be configured at raid0, and
3083 perhaps competing to be the root filesystem set. If the
3084 wrong ones get configured, or both attempt to become /,
3085 weird behaviour and or serious lossage will occur. Thus we
3086 need to bring them into the fold here, and kick them out at
3087 a later point.
3088
3089 */
3090
3091 clabel1 = cset->ac->clabel;
3092 clabel2 = ac->clabel;
3093 if ((clabel1->version == clabel2->version) &&
3094 (clabel1->serial_number == clabel2->serial_number) &&
3095 (clabel1->num_rows == clabel2->num_rows) &&
3096 (clabel1->num_columns == clabel2->num_columns) &&
3097 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3098 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3099 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3100 (clabel1->parityConfig == clabel2->parityConfig) &&
3101 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3102 (clabel1->blockSize == clabel2->blockSize) &&
3103 (clabel1->numBlocks == clabel2->numBlocks) &&
3104 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3105 (clabel1->root_partition == clabel2->root_partition) &&
3106 (clabel1->last_unit == clabel2->last_unit) &&
3107 (clabel1->config_order == clabel2->config_order)) {
3108 /* if it get's here, it almost *has* to be a match */
3109 } else {
3110 /* it's not consistent with somebody in the set..
3111 punt */
3112 return(0);
3113 }
3114 /* all was fine.. it must fit... */
3115 return(1);
3116 }
3117
/*
 * rf_have_enough_components: decide whether enough live components of
 * this set are present to configure it.  The authoritative mod_counter
 * is the highest one seen in the set; only components carrying that
 * counter are considered present.  RAID 1 is special-cased: a mirror
 * pair (even column c and odd column c+1) may lose one member, but not
 * both.  Returns 1 if the set is configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a component for column c carrying the
		   authoritative mod_counter */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
			/* Didn't find one here! */
			/* special case for RAID 1, especially
			   where there are more than 2
			   components (where RAIDframe treats
			   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just finished the odd component of a
			   mirror pair without bailing.. reset the
			   even_pair_failed flag, and go on to the
			   next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3220
3221 void
3222 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3223 RF_Raid_t *raidPtr)
3224 {
3225 RF_ComponentLabel_t *clabel;
3226 int i;
3227
3228 clabel = ac->clabel;
3229
3230 /* 1. Fill in the common stuff */
3231 config->numRow = clabel->num_rows = 1;
3232 config->numCol = clabel->num_columns;
3233 config->numSpare = 0; /* XXX should this be set here? */
3234 config->sectPerSU = clabel->sectPerSU;
3235 config->SUsPerPU = clabel->SUsPerPU;
3236 config->SUsPerRU = clabel->SUsPerRU;
3237 config->parityConfig = clabel->parityConfig;
3238 /* XXX... */
3239 strcpy(config->diskQueueType,"fifo");
3240 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3241 config->layoutSpecificSize = 0; /* XXX ?? */
3242
3243 while(ac!=NULL) {
3244 /* row/col values will be in range due to the checks
3245 in reasonable_label() */
3246 strcpy(config->devnames[0][ac->clabel->column],
3247 ac->devname);
3248 ac = ac->next;
3249 }
3250
3251 for(i=0;i<RF_MAXDBGV;i++) {
3252 config->debugVars[i][0] = 0;
3253 }
3254 }
3255
3256 int
3257 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3258 {
3259 RF_ComponentLabel_t clabel;
3260 struct vnode *vp;
3261 dev_t dev;
3262 int column;
3263 int sparecol;
3264
3265 raidPtr->autoconfigure = new_value;
3266
3267 for(column=0; column<raidPtr->numCol; column++) {
3268 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3269 dev = raidPtr->Disks[column].dev;
3270 vp = raidPtr->raid_cinfo[column].ci_vp;
3271 raidread_component_label(dev, vp, &clabel);
3272 clabel.autoconfigure = new_value;
3273 raidwrite_component_label(dev, vp, &clabel);
3274 }
3275 }
3276 for(column = 0; column < raidPtr->numSpare ; column++) {
3277 sparecol = raidPtr->numCol + column;
3278 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3279 dev = raidPtr->Disks[sparecol].dev;
3280 vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3281 raidread_component_label(dev, vp, &clabel);
3282 clabel.autoconfigure = new_value;
3283 raidwrite_component_label(dev, vp, &clabel);
3284 }
3285 }
3286 return(new_value);
3287 }
3288
3289 int
3290 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3291 {
3292 RF_ComponentLabel_t clabel;
3293 struct vnode *vp;
3294 dev_t dev;
3295 int column;
3296 int sparecol;
3297
3298 raidPtr->root_partition = new_value;
3299 for(column=0; column<raidPtr->numCol; column++) {
3300 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3301 dev = raidPtr->Disks[column].dev;
3302 vp = raidPtr->raid_cinfo[column].ci_vp;
3303 raidread_component_label(dev, vp, &clabel);
3304 clabel.root_partition = new_value;
3305 raidwrite_component_label(dev, vp, &clabel);
3306 }
3307 }
3308 for(column = 0; column < raidPtr->numSpare ; column++) {
3309 sparecol = raidPtr->numCol + column;
3310 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3311 dev = raidPtr->Disks[sparecol].dev;
3312 vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3313 raidread_component_label(dev, vp, &clabel);
3314 clabel.root_partition = new_value;
3315 raidwrite_component_label(dev, vp, &clabel);
3316 }
3317 }
3318 return(new_value);
3319 }
3320
/*
 * Close and release the vnode held open for every component in the
 * config set.  Used when an autoconfig set is abandoned (or after its
 * labels have been consumed) so the underlying devices are no longer
 * held busy.
 */
void
rf_release_all_vps(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;

	ac = cset->ac;
	while(ac!=NULL) {
		/* Close the vp, and give it back */
		if (ac->vp) {
			/* VOP_CLOSE requires the vnode locked; vput()
			   then unlocks and drops the reference taken
			   when the component was opened. */
			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(ac->vp, FREAD, NOCRED, 0);
			vput(ac->vp);
			/* clear the pointer so a later cleanup pass
			   cannot close it a second time */
			ac->vp = NULL;
		}
		ac = ac->next;
	}
}
3338
3339
3340 void
3341 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3342 {
3343 RF_AutoConfig_t *ac;
3344 RF_AutoConfig_t *next_ac;
3345
3346 ac = cset->ac;
3347 while(ac!=NULL) {
3348 next_ac = ac->next;
3349 /* nuke the label */
3350 free(ac->clabel, M_RAIDFRAME);
3351 /* cleanup the config structure */
3352 free(ac, M_RAIDFRAME);
3353 /* "next.." */
3354 ac = next_ac;
3355 }
3356 /* and, finally, nuke the config set */
3357 free(cset, M_RAIDFRAME);
3358 }
3359
3360
/*
 * Initialize a component label from the current state of the RAID set:
 * version, identity (serial number / mod counter), geometry, and the
 * autoconfig-related settings.  Per-component fields (row, column,
 * partitionSize, etc.) are NOT set here; the caller fills those in.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* stripe geometry, copied from the in-core layout */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	clabel->numBlocks = raidPtr->sectorsPerDisk;

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	/* remember which unit we were configured as, so autoconfig can
	   try to put us back there next boot */
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;
}
3388
/*
 * Autoconfigure a single config set: pick a raid unit number, build an
 * RF_Config_t from the component labels, and configure the set.
 * On success *unit holds the chosen raid unit and 0 is returned;
 * otherwise a non-zero error is returned and *unit is -1.
 */
int
rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	int retcode;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	retcode = 0;
	*unit = -1;

	/* 1. Create a config structure */

	/* M_NOWAIT: this can run early in autoconfiguration where
	   sleeping for memory is not an option */
	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
				       M_RAIDFRAME,
				       M_NOWAIT);
	if (config==NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return(1);
	}

	memset(config, 0, sizeof(RF_Config_t));

	/*
	  2. Figure out what RAID ID this one is supposed to live at
	  See if we can get the same RAID dev that it was configured
	  on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	if ((raidID < 0) || (raidID >= numraid)) {
		/* let's not wander off into lala land. */
		raidID = numraid - 1;
	}
	if (raidPtrs[raidID]->valid != 0) {

		/*
		  Nope... Go looking for an alternative...
		  Start high so we don't immediately use raid0 if that's
		  not taken.
		*/

		/* if every unit is taken, the loop falls through with
		   raidID == -1, which is caught just below */
		for(raidID = numraid - 1; raidID >= 0; raidID--) {
			if (raidPtrs[raidID]->valid == 0) {
				/* can use this one! */
				break;
			}
		}
	}

	if (raidID < 0) {
		/* punt... */
		printf("Unable to auto configure this set!\n");
		printf("(Out of RAID devs!)\n");
		free(config, M_RAIDFRAME);
		return(1);
	}

#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = raidPtrs[raidID];

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	retcode = rf_Configure(raidPtr, config, cset->ac);

	if (retcode == 0) {

		raidinit(raidPtrs[raidID]);

		/* mark components dirty until a clean shutdown proves
		   otherwise */
		rf_markalldirty(raidPtrs[raidID]);
		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
		if (cset->ac->clabel->root_partition==1) {
			/* everything configured just fine.  Make a note
			   that this set is eligible to be root. */
			cset->rootable = 1;
			/* XXX do this here? */
			raidPtrs[raidID]->root_partition = 1;
		}
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);

	*unit = raidID;
	return(retcode);
}
3489
3490 void
3491 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3492 {
3493 struct buf *bp;
3494
3495 bp = (struct buf *)desc->bp;
3496 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3497 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3498 }
3499
/*
 * Initialize a pool for RAIDframe use: create it, cap it at xmax items,
 * pre-allocate xmin items (pool_prime), and keep at least xmin around
 * (pool_setlowat) so allocations from interrupt context don't starve.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3509
3510 /*
3511 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3512 * if there is IO pending and if that IO could possibly be done for a
3513 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3514 * otherwise.
3515 *
3516 */
3517
3518 int
3519 rf_buf_queue_check(int raidid)
3520 {
3521 if ((BUFQ_PEEK(raid_softc[raidid].buf_queue) != NULL) &&
3522 raidPtrs[raidid]->openings > 0) {
3523 /* there is work to do */
3524 return 0;
3525 }
3526 /* default is nothing to do */
3527 return 1;
3528 }
3529
3530 int
3531 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
3532 {
3533 struct partinfo dpart;
3534 struct dkwedge_info dkw;
3535 int error;
3536
3537 error = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, l->l_cred, l);
3538 if (error == 0) {
3539 diskPtr->blockSize = dpart.disklab->d_secsize;
3540 diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
3541 diskPtr->partitionSize = dpart.part->p_size;
3542 return 0;
3543 }
3544
3545 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, l->l_cred, l);
3546 if (error == 0) {
3547 diskPtr->blockSize = 512; /* XXX */
3548 diskPtr->numBlocks = dkw.dkw_size - rf_protectedSectors;
3549 diskPtr->partitionSize = dkw.dkw_size;
3550 return 0;
3551 }
3552 return error;
3553 }
3554
/*
 * Autoconf match routine: raid pseudo-devices always match.
 */
static int
raid_match(struct device *self, struct cfdata *cfdata,
    void *aux)
{
	return 1;
}
3561
/*
 * Autoconf attach routine: nothing to do here; all real setup happens
 * when the raid device is configured.
 */
static void
raid_attach(struct device *parent, struct device *self,
    void *aux)
{

}
3568
3569
3570 static int
3571 raid_detach(struct device *self, int flags)
3572 {
3573 struct raid_softc *rs = (struct raid_softc *)self;
3574
3575 if (rs->sc_flags & RAIDF_INITED)
3576 return EBUSY;
3577
3578 return 0;
3579 }
3580
3581
3582