rf_netbsdkintf.c revision 1.93 1 /* $NetBSD: rf_netbsdkintf.c,v 1.93 2000/07/14 15:26:29 oster Exp $ */
2 /*-
3 * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Greg Oster; Jason R. Thorpe.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the NetBSD
20 * Foundation, Inc. and its contributors.
21 * 4. Neither the name of The NetBSD Foundation nor the names of its
22 * contributors may be used to endorse or promote products derived
23 * from this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38 /*
39 * Copyright (c) 1988 University of Utah.
40 * Copyright (c) 1990, 1993
41 * The Regents of the University of California. All rights reserved.
42 *
43 * This code is derived from software contributed to Berkeley by
44 * the Systems Programming Group of the University of Utah Computer
45 * Science Department.
46 *
47 * Redistribution and use in source and binary forms, with or without
48 * modification, are permitted provided that the following conditions
49 * are met:
50 * 1. Redistributions of source code must retain the above copyright
51 * notice, this list of conditions and the following disclaimer.
52 * 2. Redistributions in binary form must reproduce the above copyright
53 * notice, this list of conditions and the following disclaimer in the
54 * documentation and/or other materials provided with the distribution.
55 * 3. All advertising materials mentioning features or use of this software
56 * must display the following acknowledgement:
57 * This product includes software developed by the University of
58 * California, Berkeley and its contributors.
59 * 4. Neither the name of the University nor the names of its contributors
60 * may be used to endorse or promote products derived from this software
61 * without specific prior written permission.
62 *
63 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73 * SUCH DAMAGE.
74 *
75 * from: Utah $Hdr: cd.c 1.6 90/11/28$
76 *
77 * @(#)cd.c 8.2 (Berkeley) 11/16/93
78 */
79
80
81
82
83 /*
84 * Copyright (c) 1995 Carnegie-Mellon University.
85 * All rights reserved.
86 *
87 * Authors: Mark Holland, Jim Zelenka
88 *
89 * Permission to use, copy, modify and distribute this software and
90 * its documentation is hereby granted, provided that both the copyright
91 * notice and this permission notice appear in all copies of the
92 * software, derivative works or modified versions, and any portions
93 * thereof, and that both notices appear in supporting documentation.
94 *
95 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
96 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
97 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
98 *
99 * Carnegie Mellon requests users of this software to return to
100 *
101 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
102 * School of Computer Science
103 * Carnegie Mellon University
104 * Pittsburgh PA 15213-3890
105 *
106 * any improvements or extensions that they make and grant Carnegie the
107 * rights to redistribute these changes.
108 */
109
110 /***********************************************************
111 *
112 * rf_kintf.c -- the kernel interface routines for RAIDframe
113 *
114 ***********************************************************/
115
116 #include <sys/errno.h>
117 #include <sys/param.h>
118 #include <sys/pool.h>
119 #include <sys/queue.h>
120 #include <sys/disk.h>
121 #include <sys/device.h>
122 #include <sys/stat.h>
123 #include <sys/ioctl.h>
124 #include <sys/fcntl.h>
125 #include <sys/systm.h>
126 #include <sys/namei.h>
127 #include <sys/vnode.h>
128 #include <sys/param.h>
129 #include <sys/types.h>
130 #include <machine/types.h>
131 #include <sys/disklabel.h>
132 #include <sys/conf.h>
133 #include <sys/lock.h>
134 #include <sys/buf.h>
135 #include <sys/user.h>
136 #include <sys/reboot.h>
137
138 #include "raid.h"
139 #include "opt_raid_autoconfig.h"
140 #include "rf_raid.h"
141 #include "rf_raidframe.h"
142 #include "rf_copyback.h"
143 #include "rf_dag.h"
144 #include "rf_dagflags.h"
145 #include "rf_diskqueue.h"
146 #include "rf_acctrace.h"
147 #include "rf_etimer.h"
148 #include "rf_general.h"
149 #include "rf_debugMem.h"
150 #include "rf_kintf.h"
151 #include "rf_options.h"
152 #include "rf_driver.h"
153 #include "rf_parityscan.h"
154 #include "rf_debugprint.h"
155 #include "rf_threadstuff.h"
156 #include "rf_configure.h"
157
158 int rf_kdebug_level = 0;
159
160 #ifdef DEBUG
161 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
162 #else /* DEBUG */
163 #define db1_printf(a) { }
164 #endif /* DEBUG */
165
166 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
167
168 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
169
170 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
171 * spare table */
172 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
173 * installation process */
174
175 /* prototypes */
176 static void KernelWakeupFunc(struct buf * bp);
177 static void InitBP(struct buf * bp, struct vnode *, unsigned rw_flag,
178 dev_t dev, RF_SectorNum_t startSect,
179 RF_SectorCount_t numSect, caddr_t buf,
180 void (*cbFunc) (struct buf *), void *cbArg,
181 int logBytesPerSector, struct proc * b_proc);
182 static void raidinit __P((RF_Raid_t *));
183
184 void raidattach __P((int));
185 int raidsize __P((dev_t));
186 int raidopen __P((dev_t, int, int, struct proc *));
187 int raidclose __P((dev_t, int, int, struct proc *));
188 int raidioctl __P((dev_t, u_long, caddr_t, int, struct proc *));
189 int raidwrite __P((dev_t, struct uio *, int));
190 int raidread __P((dev_t, struct uio *, int));
191 void raidstrategy __P((struct buf *));
192 int raiddump __P((dev_t, daddr_t, caddr_t, size_t));
193
194 /*
195 * Pilfered from ccd.c
196 */
197
198 struct raidbuf {
199 struct buf rf_buf; /* new I/O buf. MUST BE FIRST!!! */
200 struct buf *rf_obp; /* ptr. to original I/O buf */
201 int rf_flags; /* misc. flags */
202 RF_DiskQueueData_t *req;/* the request that this was part of.. */
203 };
204
205
206 #define RAIDGETBUF(rs) pool_get(&(rs)->sc_cbufpool, PR_NOWAIT)
207 #define RAIDPUTBUF(rs, cbp) pool_put(&(rs)->sc_cbufpool, cbp)
208
209 /* XXX Not sure if the following should be replacing the raidPtrs above,
210 or if it should be used in conjunction with that...
211 */
212
213 struct raid_softc {
214 int sc_flags; /* flags */
215 int sc_cflags; /* configuration flags */
216 size_t sc_size; /* size of the raid device */
217 char sc_xname[20]; /* XXX external name */
218 struct disk sc_dkdev; /* generic disk device info */
219 struct pool sc_cbufpool; /* component buffer pool */
220 struct buf_queue buf_queue; /* used for the device queue */
221 };
222 /* sc_flags */
223 #define RAIDF_INITED 0x01 /* unit has been initialized */
224 #define RAIDF_WLABEL 0x02 /* label area is writable */
225 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
226 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
227 #define RAIDF_LOCKED 0x80 /* unit is locked */
228
229 #define raidunit(x) DISKUNIT(x)
230 int numraid = 0;
231
232 /*
233 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
234 * Be aware that large numbers can allow the driver to consume a lot of
235 * kernel memory, especially on writes, and in degraded mode reads.
236 *
237 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
238 * a single 64K write will typically require 64K for the old data,
239 * 64K for the old parity, and 64K for the new parity, for a total
240 * of 192K (if the parity buffer is not re-used immediately).
241 * Even it if is used immedately, that's still 128K, which when multiplied
242 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
243 *
244 * Now in degraded mode, for example, a 64K read on the above setup may
245 * require data reconstruction, which will require *all* of the 4 remaining
246 * disks to participate -- 4 * 32K/disk == 128K again.
247 */
248
249 #ifndef RAIDOUTSTANDING
250 #define RAIDOUTSTANDING 6
251 #endif
252
253 #define RAIDLABELDEV(dev) \
254 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
255
256 /* declared here, and made public, for the benefit of KVM stuff.. */
257 struct raid_softc *raid_softc;
258
259 static void raidgetdefaultlabel __P((RF_Raid_t *, struct raid_softc *,
260 struct disklabel *));
261 static void raidgetdisklabel __P((dev_t));
262 static void raidmakedisklabel __P((struct raid_softc *));
263
264 static int raidlock __P((struct raid_softc *));
265 static void raidunlock __P((struct raid_softc *));
266
267 static void rf_markalldirty __P((RF_Raid_t *));
268 void rf_mountroot_hook __P((struct device *));
269
270 struct device *raidrootdev;
271
272 void rf_ReconThread __P((struct rf_recon_req *));
273 /* XXX what I want is: */
274 /*void rf_ReconThread __P((RF_Raid_t *raidPtr)); */
275 void rf_RewriteParityThread __P((RF_Raid_t *raidPtr));
276 void rf_CopybackThread __P((RF_Raid_t *raidPtr));
277 void rf_ReconstructInPlaceThread __P((struct rf_recon_req *));
278 void rf_buildroothack __P((void *));
279
280 RF_AutoConfig_t *rf_find_raid_components __P((void));
281 RF_ConfigSet_t *rf_create_auto_sets __P((RF_AutoConfig_t *));
282 static int rf_does_it_fit __P((RF_ConfigSet_t *,RF_AutoConfig_t *));
283 static int rf_reasonable_label __P((RF_ComponentLabel_t *));
284 void rf_create_configuration __P((RF_AutoConfig_t *,RF_Config_t *,
285 RF_Raid_t *));
286 int rf_set_autoconfig __P((RF_Raid_t *, int));
287 int rf_set_rootpartition __P((RF_Raid_t *, int));
288 void rf_release_all_vps __P((RF_ConfigSet_t *));
289 void rf_cleanup_config_set __P((RF_ConfigSet_t *));
290 int rf_have_enough_components __P((RF_ConfigSet_t *));
291 int rf_auto_config_set __P((RF_ConfigSet_t *, int *));
292
293 static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not
294 allow autoconfig to take place.
295 Note that this is overridden by having
296 RAID_AUTOCONFIG as an option in the
297 kernel config file. */
298
299 void
300 raidattach(num)
301 int num;
302 {
303 int raidID;
304 int i, rc;
305 RF_AutoConfig_t *ac_list; /* autoconfig list */
306 RF_ConfigSet_t *config_sets;
307
308 #ifdef DEBUG
309 printf("raidattach: Asked for %d units\n", num);
310 #endif
311
312 if (num <= 0) {
313 #ifdef DIAGNOSTIC
314 panic("raidattach: count <= 0");
315 #endif
316 return;
317 }
318 /* This is where all the initialization stuff gets done. */
319
320 numraid = num;
321
322 /* Make some space for requested number of units... */
323
324 RF_Calloc(raidPtrs, num, sizeof(RF_Raid_t *), (RF_Raid_t **));
325 if (raidPtrs == NULL) {
326 panic("raidPtrs is NULL!!\n");
327 }
328
329 rc = rf_mutex_init(&rf_sparet_wait_mutex);
330 if (rc) {
331 RF_PANIC();
332 }
333
334 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
335
336 for (i = 0; i < num; i++)
337 raidPtrs[i] = NULL;
338 rc = rf_BootRaidframe();
339 if (rc == 0)
340 printf("Kernelized RAIDframe activated\n");
341 else
342 panic("Serious error booting RAID!!\n");
343
344 /* put together some datastructures like the CCD device does.. This
345 * lets us lock the device and what-not when it gets opened. */
346
347 raid_softc = (struct raid_softc *)
348 malloc(num * sizeof(struct raid_softc),
349 M_RAIDFRAME, M_NOWAIT);
350 if (raid_softc == NULL) {
351 printf("WARNING: no memory for RAIDframe driver\n");
352 return;
353 }
354
355 bzero(raid_softc, num * sizeof(struct raid_softc));
356
357 raidrootdev = (struct device *)malloc(num * sizeof(struct device),
358 M_RAIDFRAME, M_NOWAIT);
359 if (raidrootdev == NULL) {
360 panic("No memory for RAIDframe driver!!?!?!\n");
361 }
362
363 for (raidID = 0; raidID < num; raidID++) {
364 BUFQ_INIT(&raid_softc[raidID].buf_queue);
365
366 raidrootdev[raidID].dv_class = DV_DISK;
367 raidrootdev[raidID].dv_cfdata = NULL;
368 raidrootdev[raidID].dv_unit = raidID;
369 raidrootdev[raidID].dv_parent = NULL;
370 raidrootdev[raidID].dv_flags = 0;
371 sprintf(raidrootdev[raidID].dv_xname,"raid%d",raidID);
372
373 RF_Calloc(raidPtrs[raidID], 1, sizeof(RF_Raid_t),
374 (RF_Raid_t *));
375 if (raidPtrs[raidID] == NULL) {
376 printf("WARNING: raidPtrs[%d] is NULL\n", raidID);
377 numraid = raidID;
378 return;
379 }
380 }
381
382 #if RAID_AUTOCONFIG
383 raidautoconfig = 1;
384 #endif
385
386 if (raidautoconfig) {
387 /* 1. locate all RAID components on the system */
388
389 #if DEBUG
390 printf("Searching for raid components...\n");
391 #endif
392 ac_list = rf_find_raid_components();
393
394 /* 2. sort them into their respective sets */
395
396 config_sets = rf_create_auto_sets(ac_list);
397
398 /* 3. evaluate each set and configure the valid ones
399 This gets done in rf_buildroothack() */
400
401 /* schedule the creation of the thread to do the
402 "/ on RAID" stuff */
403
404 kthread_create(rf_buildroothack,config_sets);
405
406 #if 0
407 mountroothook_establish(rf_mountroot_hook, &raidrootdev[0]);
408 #endif
409 }
410
411 }
412
413 void
414 rf_buildroothack(arg)
415 void *arg;
416 {
417 RF_ConfigSet_t *config_sets = arg;
418 RF_ConfigSet_t *cset;
419 RF_ConfigSet_t *next_cset;
420 int retcode;
421 int raidID;
422 int rootID;
423 int num_root;
424
425 num_root = 0;
426 cset = config_sets;
427 while(cset != NULL ) {
428 next_cset = cset->next;
429 if (rf_have_enough_components(cset) &&
430 cset->ac->clabel->autoconfigure==1) {
431 retcode = rf_auto_config_set(cset,&raidID);
432 if (!retcode) {
433 if (cset->rootable) {
434 rootID = raidID;
435 num_root++;
436 }
437 } else {
438 /* The autoconfig didn't work :( */
439 #if DEBUG
440 printf("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
441 #endif
442 rf_release_all_vps(cset);
443 }
444 } else {
445 /* we're not autoconfiguring this set...
446 release the associated resources */
447 rf_release_all_vps(cset);
448 }
449 /* cleanup */
450 rf_cleanup_config_set(cset);
451 cset = next_cset;
452 }
453 if (boothowto & RB_ASKNAME) {
454 /* We don't auto-config... */
455 } else {
456 /* They didn't ask, and we found something bootable... */
457
458 if (num_root == 1) {
459 booted_device = &raidrootdev[rootID];
460 } else if (num_root > 1) {
461 /* we can't guess.. require the user to answer... */
462 boothowto |= RB_ASKNAME;
463 }
464 }
465 }
466
467
468 int
469 raidsize(dev)
470 dev_t dev;
471 {
472 struct raid_softc *rs;
473 struct disklabel *lp;
474 int part, unit, omask, size;
475
476 unit = raidunit(dev);
477 if (unit >= numraid)
478 return (-1);
479 rs = &raid_softc[unit];
480
481 if ((rs->sc_flags & RAIDF_INITED) == 0)
482 return (-1);
483
484 part = DISKPART(dev);
485 omask = rs->sc_dkdev.dk_openmask & (1 << part);
486 lp = rs->sc_dkdev.dk_label;
487
488 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curproc))
489 return (-1);
490
491 if (lp->d_partitions[part].p_fstype != FS_SWAP)
492 size = -1;
493 else
494 size = lp->d_partitions[part].p_size *
495 (lp->d_secsize / DEV_BSIZE);
496
497 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curproc))
498 return (-1);
499
500 return (size);
501
502 }
503
504 int
505 raiddump(dev, blkno, va, size)
506 dev_t dev;
507 daddr_t blkno;
508 caddr_t va;
509 size_t size;
510 {
511 /* Not implemented. */
512 return ENXIO;
513 }
514 /* ARGSUSED */
515 int
516 raidopen(dev, flags, fmt, p)
517 dev_t dev;
518 int flags, fmt;
519 struct proc *p;
520 {
521 int unit = raidunit(dev);
522 struct raid_softc *rs;
523 struct disklabel *lp;
524 int part, pmask;
525 int error = 0;
526
527 if (unit >= numraid)
528 return (ENXIO);
529 rs = &raid_softc[unit];
530
531 if ((error = raidlock(rs)) != 0)
532 return (error);
533 lp = rs->sc_dkdev.dk_label;
534
535 part = DISKPART(dev);
536 pmask = (1 << part);
537
538 db1_printf(("Opening raid device number: %d partition: %d\n",
539 unit, part));
540
541
542 if ((rs->sc_flags & RAIDF_INITED) &&
543 (rs->sc_dkdev.dk_openmask == 0))
544 raidgetdisklabel(dev);
545
546 /* make sure that this partition exists */
547
548 if (part != RAW_PART) {
549 db1_printf(("Not a raw partition..\n"));
550 if (((rs->sc_flags & RAIDF_INITED) == 0) ||
551 ((part >= lp->d_npartitions) ||
552 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
553 error = ENXIO;
554 raidunlock(rs);
555 db1_printf(("Bailing out...\n"));
556 return (error);
557 }
558 }
559 /* Prevent this unit from being unconfigured while open. */
560 switch (fmt) {
561 case S_IFCHR:
562 rs->sc_dkdev.dk_copenmask |= pmask;
563 break;
564
565 case S_IFBLK:
566 rs->sc_dkdev.dk_bopenmask |= pmask;
567 break;
568 }
569
570 if ((rs->sc_dkdev.dk_openmask == 0) &&
571 ((rs->sc_flags & RAIDF_INITED) != 0)) {
572 /* First one... mark things as dirty... Note that we *MUST*
573 have done a configure before this. I DO NOT WANT TO BE
574 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
575 THAT THEY BELONG TOGETHER!!!!! */
576 /* XXX should check to see if we're only open for reading
577 here... If so, we needn't do this, but then need some
578 other way of keeping track of what's happened.. */
579
580 rf_markalldirty( raidPtrs[unit] );
581 }
582
583
584 rs->sc_dkdev.dk_openmask =
585 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
586
587 raidunlock(rs);
588
589 return (error);
590
591
592 }
593 /* ARGSUSED */
594 int
595 raidclose(dev, flags, fmt, p)
596 dev_t dev;
597 int flags, fmt;
598 struct proc *p;
599 {
600 int unit = raidunit(dev);
601 struct raid_softc *rs;
602 int error = 0;
603 int part;
604
605 if (unit >= numraid)
606 return (ENXIO);
607 rs = &raid_softc[unit];
608
609 if ((error = raidlock(rs)) != 0)
610 return (error);
611
612 part = DISKPART(dev);
613
614 /* ...that much closer to allowing unconfiguration... */
615 switch (fmt) {
616 case S_IFCHR:
617 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
618 break;
619
620 case S_IFBLK:
621 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
622 break;
623 }
624 rs->sc_dkdev.dk_openmask =
625 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
626
627 if ((rs->sc_dkdev.dk_openmask == 0) &&
628 ((rs->sc_flags & RAIDF_INITED) != 0)) {
629 /* Last one... device is not unconfigured yet.
630 Device shutdown has taken care of setting the
631 clean bits if RAIDF_INITED is not set
632 mark things as clean... */
633 #if 0
634 printf("Last one on raid%d. Updating status.\n",unit);
635 #endif
636 rf_update_component_labels(raidPtrs[unit],
637 RF_FINAL_COMPONENT_UPDATE);
638 }
639
640 raidunlock(rs);
641 return (0);
642
643 }
644
645 void
646 raidstrategy(bp)
647 struct buf *bp;
648 {
649 int s;
650
651 unsigned int raidID = raidunit(bp->b_dev);
652 RF_Raid_t *raidPtr;
653 struct raid_softc *rs = &raid_softc[raidID];
654 struct disklabel *lp;
655 int wlabel;
656
657 if ((rs->sc_flags & RAIDF_INITED) ==0) {
658 bp->b_error = ENXIO;
659 bp->b_flags = B_ERROR;
660 bp->b_resid = bp->b_bcount;
661 biodone(bp);
662 return;
663 }
664 if (raidID >= numraid || !raidPtrs[raidID]) {
665 bp->b_error = ENODEV;
666 bp->b_flags |= B_ERROR;
667 bp->b_resid = bp->b_bcount;
668 biodone(bp);
669 return;
670 }
671 raidPtr = raidPtrs[raidID];
672 if (!raidPtr->valid) {
673 bp->b_error = ENODEV;
674 bp->b_flags |= B_ERROR;
675 bp->b_resid = bp->b_bcount;
676 biodone(bp);
677 return;
678 }
679 if (bp->b_bcount == 0) {
680 db1_printf(("b_bcount is zero..\n"));
681 biodone(bp);
682 return;
683 }
684 lp = rs->sc_dkdev.dk_label;
685
686 /*
687 * Do bounds checking and adjust transfer. If there's an
688 * error, the bounds check will flag that for us.
689 */
690
691 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
692 if (DISKPART(bp->b_dev) != RAW_PART)
693 if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
694 db1_printf(("Bounds check failed!!:%d %d\n",
695 (int) bp->b_blkno, (int) wlabel));
696 biodone(bp);
697 return;
698 }
699 s = splbio();
700
701 bp->b_resid = 0;
702
703 /* stuff it onto our queue */
704 BUFQ_INSERT_TAIL(&rs->buf_queue, bp);
705
706 raidstart(raidPtrs[raidID]);
707
708 splx(s);
709 }
710 /* ARGSUSED */
711 int
712 raidread(dev, uio, flags)
713 dev_t dev;
714 struct uio *uio;
715 int flags;
716 {
717 int unit = raidunit(dev);
718 struct raid_softc *rs;
719 int part;
720
721 if (unit >= numraid)
722 return (ENXIO);
723 rs = &raid_softc[unit];
724
725 if ((rs->sc_flags & RAIDF_INITED) == 0)
726 return (ENXIO);
727 part = DISKPART(dev);
728
729 db1_printf(("raidread: unit: %d partition: %d\n", unit, part));
730
731 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
732
733 }
734 /* ARGSUSED */
735 int
736 raidwrite(dev, uio, flags)
737 dev_t dev;
738 struct uio *uio;
739 int flags;
740 {
741 int unit = raidunit(dev);
742 struct raid_softc *rs;
743
744 if (unit >= numraid)
745 return (ENXIO);
746 rs = &raid_softc[unit];
747
748 if ((rs->sc_flags & RAIDF_INITED) == 0)
749 return (ENXIO);
750 db1_printf(("raidwrite\n"));
751 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
752
753 }
754
755 int
756 raidioctl(dev, cmd, data, flag, p)
757 dev_t dev;
758 u_long cmd;
759 caddr_t data;
760 int flag;
761 struct proc *p;
762 {
763 int unit = raidunit(dev);
764 int error = 0;
765 int part, pmask;
766 struct raid_softc *rs;
767 RF_Config_t *k_cfg, *u_cfg;
768 RF_Raid_t *raidPtr;
769 RF_RaidDisk_t *diskPtr;
770 RF_AccTotals_t *totals;
771 RF_DeviceConfig_t *d_cfg, **ucfgp;
772 u_char *specific_buf;
773 int retcode = 0;
774 int row;
775 int column;
776 struct rf_recon_req *rrcopy, *rr;
777 RF_ComponentLabel_t *clabel;
778 RF_ComponentLabel_t ci_label;
779 RF_ComponentLabel_t **clabel_ptr;
780 RF_SingleComponent_t *sparePtr,*componentPtr;
781 RF_SingleComponent_t hot_spare;
782 RF_SingleComponent_t component;
783 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
784 int i, j, d;
785
786 if (unit >= numraid)
787 return (ENXIO);
788 rs = &raid_softc[unit];
789 raidPtr = raidPtrs[unit];
790
791 db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
792 (int) DISKPART(dev), (int) unit, (int) cmd));
793
794 /* Must be open for writes for these commands... */
795 switch (cmd) {
796 case DIOCSDINFO:
797 case DIOCWDINFO:
798 case DIOCWLABEL:
799 if ((flag & FWRITE) == 0)
800 return (EBADF);
801 }
802
803 /* Must be initialized for these... */
804 switch (cmd) {
805 case DIOCGDINFO:
806 case DIOCSDINFO:
807 case DIOCWDINFO:
808 case DIOCGPART:
809 case DIOCWLABEL:
810 case DIOCGDEFLABEL:
811 case RAIDFRAME_SHUTDOWN:
812 case RAIDFRAME_REWRITEPARITY:
813 case RAIDFRAME_GET_INFO:
814 case RAIDFRAME_RESET_ACCTOTALS:
815 case RAIDFRAME_GET_ACCTOTALS:
816 case RAIDFRAME_KEEP_ACCTOTALS:
817 case RAIDFRAME_GET_SIZE:
818 case RAIDFRAME_FAIL_DISK:
819 case RAIDFRAME_COPYBACK:
820 case RAIDFRAME_CHECK_RECON_STATUS:
821 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
822 case RAIDFRAME_GET_COMPONENT_LABEL:
823 case RAIDFRAME_SET_COMPONENT_LABEL:
824 case RAIDFRAME_ADD_HOT_SPARE:
825 case RAIDFRAME_REMOVE_HOT_SPARE:
826 case RAIDFRAME_INIT_LABELS:
827 case RAIDFRAME_REBUILD_IN_PLACE:
828 case RAIDFRAME_CHECK_PARITY:
829 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
830 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
831 case RAIDFRAME_CHECK_COPYBACK_STATUS:
832 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
833 case RAIDFRAME_SET_AUTOCONFIG:
834 case RAIDFRAME_SET_ROOT:
835 case RAIDFRAME_DELETE_COMPONENT:
836 case RAIDFRAME_INCORPORATE_HOT_SPARE:
837 if ((rs->sc_flags & RAIDF_INITED) == 0)
838 return (ENXIO);
839 }
840
841 switch (cmd) {
842
843 /* configure the system */
844 case RAIDFRAME_CONFIGURE:
845
846 if (raidPtr->valid) {
847 /* There is a valid RAID set running on this unit! */
848 printf("raid%d: Device already configured!\n",unit);
849 return(EINVAL);
850 }
851
852 /* copy-in the configuration information */
853 /* data points to a pointer to the configuration structure */
854
855 u_cfg = *((RF_Config_t **) data);
856 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
857 if (k_cfg == NULL) {
858 return (ENOMEM);
859 }
860 retcode = copyin((caddr_t) u_cfg, (caddr_t) k_cfg,
861 sizeof(RF_Config_t));
862 if (retcode) {
863 RF_Free(k_cfg, sizeof(RF_Config_t));
864 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
865 retcode));
866 return (retcode);
867 }
868 /* allocate a buffer for the layout-specific data, and copy it
869 * in */
870 if (k_cfg->layoutSpecificSize) {
871 if (k_cfg->layoutSpecificSize > 10000) {
872 /* sanity check */
873 RF_Free(k_cfg, sizeof(RF_Config_t));
874 return (EINVAL);
875 }
876 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
877 (u_char *));
878 if (specific_buf == NULL) {
879 RF_Free(k_cfg, sizeof(RF_Config_t));
880 return (ENOMEM);
881 }
882 retcode = copyin(k_cfg->layoutSpecific,
883 (caddr_t) specific_buf,
884 k_cfg->layoutSpecificSize);
885 if (retcode) {
886 RF_Free(k_cfg, sizeof(RF_Config_t));
887 RF_Free(specific_buf,
888 k_cfg->layoutSpecificSize);
889 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
890 retcode));
891 return (retcode);
892 }
893 } else
894 specific_buf = NULL;
895 k_cfg->layoutSpecific = specific_buf;
896
897 /* should do some kind of sanity check on the configuration.
898 * Store the sum of all the bytes in the last byte? */
899
900 /* configure the system */
901
902 /*
903 * Clear the entire RAID descriptor, just to make sure
904 * there is no stale data left in the case of a
905 * reconfiguration
906 */
907 bzero((char *) raidPtr, sizeof(RF_Raid_t));
908 raidPtr->raidid = unit;
909
910 retcode = rf_Configure(raidPtr, k_cfg, NULL);
911
912 if (retcode == 0) {
913
914 /* allow this many simultaneous IO's to
915 this RAID device */
916 raidPtr->openings = RAIDOUTSTANDING;
917
918 raidinit(raidPtr);
919 rf_markalldirty(raidPtr);
920 }
921 /* free the buffers. No return code here. */
922 if (k_cfg->layoutSpecificSize) {
923 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
924 }
925 RF_Free(k_cfg, sizeof(RF_Config_t));
926
927 return (retcode);
928
929 /* shutdown the system */
930 case RAIDFRAME_SHUTDOWN:
931
932 if ((error = raidlock(rs)) != 0)
933 return (error);
934
935 /*
936 * If somebody has a partition mounted, we shouldn't
937 * shutdown.
938 */
939
940 part = DISKPART(dev);
941 pmask = (1 << part);
942 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
943 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
944 (rs->sc_dkdev.dk_copenmask & pmask))) {
945 raidunlock(rs);
946 return (EBUSY);
947 }
948
949 retcode = rf_Shutdown(raidPtr);
950
951 pool_destroy(&rs->sc_cbufpool);
952
953 /* It's no longer initialized... */
954 rs->sc_flags &= ~RAIDF_INITED;
955
956 /* Detach the disk. */
957 disk_detach(&rs->sc_dkdev);
958
959 raidunlock(rs);
960
961 return (retcode);
962 case RAIDFRAME_GET_COMPONENT_LABEL:
963 clabel_ptr = (RF_ComponentLabel_t **) data;
964 /* need to read the component label for the disk indicated
965 by row,column in clabel */
966
967 /* For practice, let's get it directly fromdisk, rather
968 than from the in-core copy */
969 RF_Malloc( clabel, sizeof( RF_ComponentLabel_t ),
970 (RF_ComponentLabel_t *));
971 if (clabel == NULL)
972 return (ENOMEM);
973
974 bzero((char *) clabel, sizeof(RF_ComponentLabel_t));
975
976 retcode = copyin( *clabel_ptr, clabel,
977 sizeof(RF_ComponentLabel_t));
978
979 if (retcode) {
980 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
981 return(retcode);
982 }
983
984 row = clabel->row;
985 column = clabel->column;
986
987 if ((row < 0) || (row >= raidPtr->numRow) ||
988 (column < 0) || (column >= raidPtr->numCol +
989 raidPtr->numSpare)) {
990 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
991 return(EINVAL);
992 }
993
994 raidread_component_label(raidPtr->Disks[row][column].dev,
995 raidPtr->raid_cinfo[row][column].ci_vp,
996 clabel );
997
998 retcode = copyout((caddr_t) clabel,
999 (caddr_t) *clabel_ptr,
1000 sizeof(RF_ComponentLabel_t));
1001 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
1002 return (retcode);
1003
1004 case RAIDFRAME_SET_COMPONENT_LABEL:
1005 clabel = (RF_ComponentLabel_t *) data;
1006
1007 /* XXX check the label for valid stuff... */
1008 /* Note that some things *should not* get modified --
1009 the user should be re-initing the labels instead of
1010 trying to patch things.
1011 */
1012
1013 printf("Got component label:\n");
1014 printf("Version: %d\n",clabel->version);
1015 printf("Serial Number: %d\n",clabel->serial_number);
1016 printf("Mod counter: %d\n",clabel->mod_counter);
1017 printf("Row: %d\n", clabel->row);
1018 printf("Column: %d\n", clabel->column);
1019 printf("Num Rows: %d\n", clabel->num_rows);
1020 printf("Num Columns: %d\n", clabel->num_columns);
1021 printf("Clean: %d\n", clabel->clean);
1022 printf("Status: %d\n", clabel->status);
1023
1024 row = clabel->row;
1025 column = clabel->column;
1026
1027 if ((row < 0) || (row >= raidPtr->numRow) ||
1028 (column < 0) || (column >= raidPtr->numCol)) {
1029 return(EINVAL);
1030 }
1031
1032 /* XXX this isn't allowed to do anything for now :-) */
1033
1034 /* XXX and before it is, we need to fill in the rest
1035 of the fields!?!?!?! */
1036 #if 0
1037 raidwrite_component_label(
1038 raidPtr->Disks[row][column].dev,
1039 raidPtr->raid_cinfo[row][column].ci_vp,
1040 clabel );
1041 #endif
1042 return (0);
1043
1044 case RAIDFRAME_INIT_LABELS:
1045 clabel = (RF_ComponentLabel_t *) data;
1046 /*
1047 we only want the serial number from
1048 the above. We get all the rest of the information
1049 from the config that was used to create this RAID
1050 set.
1051 */
1052
1053 raidPtr->serial_number = clabel->serial_number;
1054
1055 raid_init_component_label(raidPtr, &ci_label);
1056 ci_label.serial_number = clabel->serial_number;
1057
1058 for(row=0;row<raidPtr->numRow;row++) {
1059 ci_label.row = row;
1060 for(column=0;column<raidPtr->numCol;column++) {
1061 diskPtr = &raidPtr->Disks[row][column];
1062 ci_label.partitionSize = diskPtr->partitionSize;
1063 ci_label.column = column;
1064 raidwrite_component_label(
1065 raidPtr->Disks[row][column].dev,
1066 raidPtr->raid_cinfo[row][column].ci_vp,
1067 &ci_label );
1068 }
1069 }
1070
1071 return (retcode);
1072 case RAIDFRAME_SET_AUTOCONFIG:
1073 d = rf_set_autoconfig(raidPtr, *(int *) data);
1074 printf("New autoconfig value is: %d\n", d);
1075 *(int *) data = d;
1076 return (retcode);
1077
1078 case RAIDFRAME_SET_ROOT:
1079 d = rf_set_rootpartition(raidPtr, *(int *) data);
1080 printf("New rootpartition value is: %d\n", d);
1081 *(int *) data = d;
1082 return (retcode);
1083
1084 /* initialize all parity */
1085 case RAIDFRAME_REWRITEPARITY:
1086
1087 if (raidPtr->Layout.map->faultsTolerated == 0) {
1088 /* Parity for RAID 0 is trivially correct */
1089 raidPtr->parity_good = RF_RAID_CLEAN;
1090 return(0);
1091 }
1092
1093 if (raidPtr->parity_rewrite_in_progress == 1) {
1094 /* Re-write is already in progress! */
1095 return(EINVAL);
1096 }
1097
1098 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1099 rf_RewriteParityThread,
1100 raidPtr,"raid_parity");
1101 return (retcode);
1102
1103
1104 case RAIDFRAME_ADD_HOT_SPARE:
1105 sparePtr = (RF_SingleComponent_t *) data;
1106 memcpy( &hot_spare, sparePtr, sizeof(RF_SingleComponent_t));
1107 retcode = rf_add_hot_spare(raidPtr, &hot_spare);
1108 return(retcode);
1109
1110 case RAIDFRAME_REMOVE_HOT_SPARE:
1111 return(retcode);
1112
1113 case RAIDFRAME_DELETE_COMPONENT:
1114 componentPtr = (RF_SingleComponent_t *)data;
1115 memcpy( &component, componentPtr,
1116 sizeof(RF_SingleComponent_t));
1117 retcode = rf_delete_component(raidPtr, &component);
1118 return(retcode);
1119
1120 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1121 componentPtr = (RF_SingleComponent_t *)data;
1122 memcpy( &component, componentPtr,
1123 sizeof(RF_SingleComponent_t));
1124 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1125 return(retcode);
1126
1127 case RAIDFRAME_REBUILD_IN_PLACE:
1128
1129 if (raidPtr->Layout.map->faultsTolerated == 0) {
1130 /* Can't do this on a RAID 0!! */
1131 return(EINVAL);
1132 }
1133
1134 if (raidPtr->recon_in_progress == 1) {
1135 /* a reconstruct is already in progress! */
1136 return(EINVAL);
1137 }
1138
1139 componentPtr = (RF_SingleComponent_t *) data;
1140 memcpy( &component, componentPtr,
1141 sizeof(RF_SingleComponent_t));
1142 row = component.row;
1143 column = component.column;
1144 printf("Rebuild: %d %d\n",row, column);
1145 if ((row < 0) || (row >= raidPtr->numRow) ||
1146 (column < 0) || (column >= raidPtr->numCol)) {
1147 return(EINVAL);
1148 }
1149
1150 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1151 if (rrcopy == NULL)
1152 return(ENOMEM);
1153
1154 rrcopy->raidPtr = (void *) raidPtr;
1155 rrcopy->row = row;
1156 rrcopy->col = column;
1157
1158 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1159 rf_ReconstructInPlaceThread,
1160 rrcopy,"raid_reconip");
1161 return(retcode);
1162
1163 case RAIDFRAME_GET_INFO:
1164 if (!raidPtr->valid)
1165 return (ENODEV);
1166 ucfgp = (RF_DeviceConfig_t **) data;
1167 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1168 (RF_DeviceConfig_t *));
1169 if (d_cfg == NULL)
1170 return (ENOMEM);
1171 bzero((char *) d_cfg, sizeof(RF_DeviceConfig_t));
1172 d_cfg->rows = raidPtr->numRow;
1173 d_cfg->cols = raidPtr->numCol;
1174 d_cfg->ndevs = raidPtr->numRow * raidPtr->numCol;
1175 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1176 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1177 return (ENOMEM);
1178 }
1179 d_cfg->nspares = raidPtr->numSpare;
1180 if (d_cfg->nspares >= RF_MAX_DISKS) {
1181 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1182 return (ENOMEM);
1183 }
1184 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1185 d = 0;
1186 for (i = 0; i < d_cfg->rows; i++) {
1187 for (j = 0; j < d_cfg->cols; j++) {
1188 d_cfg->devs[d] = raidPtr->Disks[i][j];
1189 d++;
1190 }
1191 }
1192 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1193 d_cfg->spares[i] = raidPtr->Disks[0][j];
1194 }
1195 retcode = copyout((caddr_t) d_cfg, (caddr_t) * ucfgp,
1196 sizeof(RF_DeviceConfig_t));
1197 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1198
1199 return (retcode);
1200
1201 case RAIDFRAME_CHECK_PARITY:
1202 *(int *) data = raidPtr->parity_good;
1203 return (0);
1204
1205 case RAIDFRAME_RESET_ACCTOTALS:
1206 bzero(&raidPtr->acc_totals, sizeof(raidPtr->acc_totals));
1207 return (0);
1208
1209 case RAIDFRAME_GET_ACCTOTALS:
1210 totals = (RF_AccTotals_t *) data;
1211 *totals = raidPtr->acc_totals;
1212 return (0);
1213
1214 case RAIDFRAME_KEEP_ACCTOTALS:
1215 raidPtr->keep_acc_totals = *(int *)data;
1216 return (0);
1217
1218 case RAIDFRAME_GET_SIZE:
1219 *(int *) data = raidPtr->totalSectors;
1220 return (0);
1221
1222 /* fail a disk & optionally start reconstruction */
1223 case RAIDFRAME_FAIL_DISK:
1224
1225 if (raidPtr->Layout.map->faultsTolerated == 0) {
1226 /* Can't do this on a RAID 0!! */
1227 return(EINVAL);
1228 }
1229
1230 rr = (struct rf_recon_req *) data;
1231
1232 if (rr->row < 0 || rr->row >= raidPtr->numRow
1233 || rr->col < 0 || rr->col >= raidPtr->numCol)
1234 return (EINVAL);
1235
1236 printf("raid%d: Failing the disk: row: %d col: %d\n",
1237 unit, rr->row, rr->col);
1238
1239 /* make a copy of the recon request so that we don't rely on
1240 * the user's buffer */
1241 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1242 if (rrcopy == NULL)
1243 return(ENOMEM);
1244 bcopy(rr, rrcopy, sizeof(*rr));
1245 rrcopy->raidPtr = (void *) raidPtr;
1246
1247 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1248 rf_ReconThread,
1249 rrcopy,"raid_recon");
1250 return (0);
1251
1252 /* invoke a copyback operation after recon on whatever disk
1253 * needs it, if any */
1254 case RAIDFRAME_COPYBACK:
1255
1256 if (raidPtr->Layout.map->faultsTolerated == 0) {
1257 /* This makes no sense on a RAID 0!! */
1258 return(EINVAL);
1259 }
1260
1261 if (raidPtr->copyback_in_progress == 1) {
1262 /* Copyback is already in progress! */
1263 return(EINVAL);
1264 }
1265
1266 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1267 rf_CopybackThread,
1268 raidPtr,"raid_copyback");
1269 return (retcode);
1270
1271 /* return the percentage completion of reconstruction */
1272 case RAIDFRAME_CHECK_RECON_STATUS:
1273 if (raidPtr->Layout.map->faultsTolerated == 0) {
1274 /* This makes no sense on a RAID 0, so tell the
1275 user it's done. */
1276 *(int *) data = 100;
1277 return(0);
1278 }
1279 row = 0; /* XXX we only consider a single row... */
1280 if (raidPtr->status[row] != rf_rs_reconstructing)
1281 *(int *) data = 100;
1282 else
1283 *(int *) data = raidPtr->reconControl[row]->percentComplete;
1284 return (0);
1285 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1286 progressInfoPtr = (RF_ProgressInfo_t **) data;
1287 row = 0; /* XXX we only consider a single row... */
1288 if (raidPtr->status[row] != rf_rs_reconstructing) {
1289 progressInfo.remaining = 0;
1290 progressInfo.completed = 100;
1291 progressInfo.total = 100;
1292 } else {
1293 progressInfo.total =
1294 raidPtr->reconControl[row]->numRUsTotal;
1295 progressInfo.completed =
1296 raidPtr->reconControl[row]->numRUsComplete;
1297 progressInfo.remaining = progressInfo.total -
1298 progressInfo.completed;
1299 }
1300 retcode = copyout((caddr_t) &progressInfo,
1301 (caddr_t) *progressInfoPtr,
1302 sizeof(RF_ProgressInfo_t));
1303 return (retcode);
1304
1305 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1306 if (raidPtr->Layout.map->faultsTolerated == 0) {
1307 /* This makes no sense on a RAID 0, so tell the
1308 user it's done. */
1309 *(int *) data = 100;
1310 return(0);
1311 }
1312 if (raidPtr->parity_rewrite_in_progress == 1) {
1313 *(int *) data = 100 *
1314 raidPtr->parity_rewrite_stripes_done /
1315 raidPtr->Layout.numStripe;
1316 } else {
1317 *(int *) data = 100;
1318 }
1319 return (0);
1320
1321 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1322 progressInfoPtr = (RF_ProgressInfo_t **) data;
1323 if (raidPtr->parity_rewrite_in_progress == 1) {
1324 progressInfo.total = raidPtr->Layout.numStripe;
1325 progressInfo.completed =
1326 raidPtr->parity_rewrite_stripes_done;
1327 progressInfo.remaining = progressInfo.total -
1328 progressInfo.completed;
1329 } else {
1330 progressInfo.remaining = 0;
1331 progressInfo.completed = 100;
1332 progressInfo.total = 100;
1333 }
1334 retcode = copyout((caddr_t) &progressInfo,
1335 (caddr_t) *progressInfoPtr,
1336 sizeof(RF_ProgressInfo_t));
1337 return (retcode);
1338
1339 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1340 if (raidPtr->Layout.map->faultsTolerated == 0) {
1341 /* This makes no sense on a RAID 0 */
1342 *(int *) data = 100;
1343 return(0);
1344 }
1345 if (raidPtr->copyback_in_progress == 1) {
1346 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1347 raidPtr->Layout.numStripe;
1348 } else {
1349 *(int *) data = 100;
1350 }
1351 return (0);
1352
1353 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1354 progressInfoPtr = (RF_ProgressInfo_t **) data;
1355 if (raidPtr->copyback_in_progress == 1) {
1356 progressInfo.total = raidPtr->Layout.numStripe;
1357 progressInfo.completed =
1358 raidPtr->copyback_stripes_done;
1359 progressInfo.remaining = progressInfo.total -
1360 progressInfo.completed;
1361 } else {
1362 progressInfo.remaining = 0;
1363 progressInfo.completed = 100;
1364 progressInfo.total = 100;
1365 }
1366 retcode = copyout((caddr_t) &progressInfo,
1367 (caddr_t) *progressInfoPtr,
1368 sizeof(RF_ProgressInfo_t));
1369 return (retcode);
1370
1371 /* the sparetable daemon calls this to wait for the kernel to
1372 * need a spare table. this ioctl does not return until a
1373 * spare table is needed. XXX -- calling mpsleep here in the
1374 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1375 * -- I should either compute the spare table in the kernel,
1376 * or have a different -- XXX XXX -- interface (a different
1377 * character device) for delivering the table -- XXX */
1378 #if 0
1379 case RAIDFRAME_SPARET_WAIT:
1380 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1381 while (!rf_sparet_wait_queue)
1382 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1383 waitreq = rf_sparet_wait_queue;
1384 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1385 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1386
1387 /* structure assignment */
1388 *((RF_SparetWait_t *) data) = *waitreq;
1389
1390 RF_Free(waitreq, sizeof(*waitreq));
1391 return (0);
1392
1393 /* wakes up a process waiting on SPARET_WAIT and puts an error
1394 * code in it that will cause the dameon to exit */
1395 case RAIDFRAME_ABORT_SPARET_WAIT:
1396 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1397 waitreq->fcol = -1;
1398 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1399 waitreq->next = rf_sparet_wait_queue;
1400 rf_sparet_wait_queue = waitreq;
1401 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1402 wakeup(&rf_sparet_wait_queue);
1403 return (0);
1404
1405 /* used by the spare table daemon to deliver a spare table
1406 * into the kernel */
1407 case RAIDFRAME_SEND_SPARET:
1408
1409 /* install the spare table */
1410 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1411
1412 /* respond to the requestor. the return status of the spare
1413 * table installation is passed in the "fcol" field */
1414 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1415 waitreq->fcol = retcode;
1416 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1417 waitreq->next = rf_sparet_resp_queue;
1418 rf_sparet_resp_queue = waitreq;
1419 wakeup(&rf_sparet_resp_queue);
1420 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1421
1422 return (retcode);
1423 #endif
1424
1425 default:
1426 break; /* fall through to the os-specific code below */
1427
1428 }
1429
1430 if (!raidPtr->valid)
1431 return (EINVAL);
1432
1433 /*
1434 * Add support for "regular" device ioctls here.
1435 */
1436
1437 switch (cmd) {
1438 case DIOCGDINFO:
1439 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1440 break;
1441
1442 case DIOCGPART:
1443 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1444 ((struct partinfo *) data)->part =
1445 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1446 break;
1447
1448 case DIOCWDINFO:
1449 case DIOCSDINFO:
1450 if ((error = raidlock(rs)) != 0)
1451 return (error);
1452
1453 rs->sc_flags |= RAIDF_LABELLING;
1454
1455 error = setdisklabel(rs->sc_dkdev.dk_label,
1456 (struct disklabel *) data, 0, rs->sc_dkdev.dk_cpulabel);
1457 if (error == 0) {
1458 if (cmd == DIOCWDINFO)
1459 error = writedisklabel(RAIDLABELDEV(dev),
1460 raidstrategy, rs->sc_dkdev.dk_label,
1461 rs->sc_dkdev.dk_cpulabel);
1462 }
1463 rs->sc_flags &= ~RAIDF_LABELLING;
1464
1465 raidunlock(rs);
1466
1467 if (error)
1468 return (error);
1469 break;
1470
1471 case DIOCWLABEL:
1472 if (*(int *) data != 0)
1473 rs->sc_flags |= RAIDF_WLABEL;
1474 else
1475 rs->sc_flags &= ~RAIDF_WLABEL;
1476 break;
1477
1478 case DIOCGDEFLABEL:
1479 raidgetdefaultlabel(raidPtr, rs,
1480 (struct disklabel *) data);
1481 break;
1482
1483 default:
1484 retcode = ENOTTY;
1485 }
1486 return (retcode);
1487
1488 }
1489
1490
1491 /* raidinit -- complete the rest of the initialization for the
1492 RAIDframe device. */
1493
1494
1495 static void
1496 raidinit(raidPtr)
1497 RF_Raid_t *raidPtr;
1498 {
1499 struct raid_softc *rs;
1500 int unit;
1501
1502 unit = raidPtr->raidid;
1503
1504 rs = &raid_softc[unit];
1505 pool_init(&rs->sc_cbufpool, sizeof(struct raidbuf), 0,
1506 0, 0, "raidpl", 0, NULL, NULL, M_RAIDFRAME);
1507
1508
1509 /* XXX should check return code first... */
1510 rs->sc_flags |= RAIDF_INITED;
1511
1512 sprintf(rs->sc_xname, "raid%d", unit); /* XXX doesn't check bounds. */
1513
1514 rs->sc_dkdev.dk_name = rs->sc_xname;
1515
1516 /* disk_attach actually creates space for the CPU disklabel, among
1517 * other things, so it's critical to call this *BEFORE* we try putzing
1518 * with disklabels. */
1519
1520 disk_attach(&rs->sc_dkdev);
1521
1522 /* XXX There may be a weird interaction here between this, and
1523 * protectedSectors, as used in RAIDframe. */
1524
1525 rs->sc_size = raidPtr->totalSectors;
1526
1527 }
1528
1529 /* wake up the daemon & tell it to get us a spare table
1530 * XXX
1531 * the entries in the queues should be tagged with the raidPtr
1532 * so that in the extremely rare case that two recons happen at once,
1533 * we know for which device were requesting a spare table
1534 * XXX
1535 *
1536 * XXX This code is not currently used. GO
1537 */
1538 int
1539 rf_GetSpareTableFromDaemon(req)
1540 RF_SparetWait_t *req;
1541 {
1542 int retcode;
1543
1544 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1545 req->next = rf_sparet_wait_queue;
1546 rf_sparet_wait_queue = req;
1547 wakeup(&rf_sparet_wait_queue);
1548
1549 /* mpsleep unlocks the mutex */
1550 while (!rf_sparet_resp_queue) {
1551 tsleep(&rf_sparet_resp_queue, PRIBIO,
1552 "raidframe getsparetable", 0);
1553 }
1554 req = rf_sparet_resp_queue;
1555 rf_sparet_resp_queue = req->next;
1556 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1557
1558 retcode = req->fcol;
1559 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1560 * alloc'd */
1561 return (retcode);
1562 }
1563
1564 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1565 * bp & passes it down.
1566 * any calls originating in the kernel must use non-blocking I/O
1567 * do some extra sanity checking to return "appropriate" error values for
1568 * certain conditions (to make some standard utilities work)
1569 *
1570 * Formerly known as: rf_DoAccessKernel
1571 */
1572 void
1573 raidstart(raidPtr)
1574 RF_Raid_t *raidPtr;
1575 {
1576 RF_SectorCount_t num_blocks, pb, sum;
1577 RF_RaidAddr_t raid_addr;
1578 int retcode;
1579 struct partition *pp;
1580 daddr_t blocknum;
1581 int unit;
1582 struct raid_softc *rs;
1583 int do_async;
1584 struct buf *bp;
1585
1586 unit = raidPtr->raidid;
1587 rs = &raid_softc[unit];
1588
1589 /* quick check to see if anything has died recently */
1590 RF_LOCK_MUTEX(raidPtr->mutex);
1591 if (raidPtr->numNewFailures > 0) {
1592 rf_update_component_labels(raidPtr,
1593 RF_NORMAL_COMPONENT_UPDATE);
1594 raidPtr->numNewFailures--;
1595 }
1596 RF_UNLOCK_MUTEX(raidPtr->mutex);
1597
1598 /* Check to see if we're at the limit... */
1599 RF_LOCK_MUTEX(raidPtr->mutex);
1600 while (raidPtr->openings > 0) {
1601 RF_UNLOCK_MUTEX(raidPtr->mutex);
1602
1603 /* get the next item, if any, from the queue */
1604 if ((bp = BUFQ_FIRST(&rs->buf_queue)) == NULL) {
1605 /* nothing more to do */
1606 return;
1607 }
1608 BUFQ_REMOVE(&rs->buf_queue, bp);
1609
1610 /* Ok, for the bp we have here, bp->b_blkno is relative to the
1611 * partition.. Need to make it absolute to the underlying
1612 * device.. */
1613
1614 blocknum = bp->b_blkno;
1615 if (DISKPART(bp->b_dev) != RAW_PART) {
1616 pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
1617 blocknum += pp->p_offset;
1618 }
1619
1620 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1621 (int) blocknum));
1622
1623 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1624 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1625
1626 /* *THIS* is where we adjust what block we're going to...
1627 * but DO NOT TOUCH bp->b_blkno!!! */
1628 raid_addr = blocknum;
1629
1630 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1631 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1632 sum = raid_addr + num_blocks + pb;
1633 if (1 || rf_debugKernelAccess) {
1634 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1635 (int) raid_addr, (int) sum, (int) num_blocks,
1636 (int) pb, (int) bp->b_resid));
1637 }
1638 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1639 || (sum < num_blocks) || (sum < pb)) {
1640 bp->b_error = ENOSPC;
1641 bp->b_flags |= B_ERROR;
1642 bp->b_resid = bp->b_bcount;
1643 biodone(bp);
1644 RF_LOCK_MUTEX(raidPtr->mutex);
1645 continue;
1646 }
1647 /*
1648 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1649 */
1650
1651 if (bp->b_bcount & raidPtr->sectorMask) {
1652 bp->b_error = EINVAL;
1653 bp->b_flags |= B_ERROR;
1654 bp->b_resid = bp->b_bcount;
1655 biodone(bp);
1656 RF_LOCK_MUTEX(raidPtr->mutex);
1657 continue;
1658
1659 }
1660 db1_printf(("Calling DoAccess..\n"));
1661
1662
1663 RF_LOCK_MUTEX(raidPtr->mutex);
1664 raidPtr->openings--;
1665 RF_UNLOCK_MUTEX(raidPtr->mutex);
1666
1667 /*
1668 * Everything is async.
1669 */
1670 do_async = 1;
1671
1672 /* don't ever condition on bp->b_flags & B_WRITE.
1673 * always condition on B_READ instead */
1674
1675 /* XXX we're still at splbio() here... do we *really*
1676 need to be? */
1677
1678
1679 retcode = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
1680 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
1681 do_async, raid_addr, num_blocks,
1682 bp->b_data, bp, NULL, NULL,
1683 RF_DAG_NONBLOCKING_IO, NULL, NULL, NULL);
1684
1685
1686 RF_LOCK_MUTEX(raidPtr->mutex);
1687 }
1688 RF_UNLOCK_MUTEX(raidPtr->mutex);
1689 }
1690
1691
1692
1693
1694 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
1695
1696 int
1697 rf_DispatchKernelIO(queue, req)
1698 RF_DiskQueue_t *queue;
1699 RF_DiskQueueData_t *req;
1700 {
1701 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
1702 struct buf *bp;
1703 struct raidbuf *raidbp = NULL;
1704 struct raid_softc *rs;
1705 int unit;
1706 int s;
1707
1708 s=0;
1709 /* s = splbio();*/ /* want to test this */
1710 /* XXX along with the vnode, we also need the softc associated with
1711 * this device.. */
1712
1713 req->queue = queue;
1714
1715 unit = queue->raidPtr->raidid;
1716
1717 db1_printf(("DispatchKernelIO unit: %d\n", unit));
1718
1719 if (unit >= numraid) {
1720 printf("Invalid unit number: %d %d\n", unit, numraid);
1721 panic("Invalid Unit number in rf_DispatchKernelIO\n");
1722 }
1723 rs = &raid_softc[unit];
1724
1725 /* XXX is this the right place? */
1726 disk_busy(&rs->sc_dkdev);
1727
1728 bp = req->bp;
1729 #if 1
1730 /* XXX when there is a physical disk failure, someone is passing us a
1731 * buffer that contains old stuff!! Attempt to deal with this problem
1732 * without taking a performance hit... (not sure where the real bug
1733 * is. It's buried in RAIDframe somewhere) :-( GO ) */
1734
1735 if (bp->b_flags & B_ERROR) {
1736 bp->b_flags &= ~B_ERROR;
1737 }
1738 if (bp->b_error != 0) {
1739 bp->b_error = 0;
1740 }
1741 #endif
1742 raidbp = RAIDGETBUF(rs);
1743
1744 raidbp->rf_flags = 0; /* XXX not really used anywhere... */
1745
1746 /*
1747 * context for raidiodone
1748 */
1749 raidbp->rf_obp = bp;
1750 raidbp->req = req;
1751
1752 LIST_INIT(&raidbp->rf_buf.b_dep);
1753
1754 switch (req->type) {
1755 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
1756 /* XXX need to do something extra here.. */
1757 /* I'm leaving this in, as I've never actually seen it used,
1758 * and I'd like folks to report it... GO */
1759 printf(("WAKEUP CALLED\n"));
1760 queue->numOutstanding++;
1761
1762 /* XXX need to glue the original buffer into this?? */
1763
1764 KernelWakeupFunc(&raidbp->rf_buf);
1765 break;
1766
1767 case RF_IO_TYPE_READ:
1768 case RF_IO_TYPE_WRITE:
1769
1770 if (req->tracerec) {
1771 RF_ETIMER_START(req->tracerec->timer);
1772 }
1773 InitBP(&raidbp->rf_buf, queue->rf_cinfo->ci_vp,
1774 op | bp->b_flags, queue->rf_cinfo->ci_dev,
1775 req->sectorOffset, req->numSector,
1776 req->buf, KernelWakeupFunc, (void *) req,
1777 queue->raidPtr->logBytesPerSector, req->b_proc);
1778
1779 if (rf_debugKernelAccess) {
1780 db1_printf(("dispatch: bp->b_blkno = %ld\n",
1781 (long) bp->b_blkno));
1782 }
1783 queue->numOutstanding++;
1784 queue->last_deq_sector = req->sectorOffset;
1785 /* acc wouldn't have been let in if there were any pending
1786 * reqs at any other priority */
1787 queue->curPriority = req->priority;
1788
1789 db1_printf(("Going for %c to unit %d row %d col %d\n",
1790 req->type, unit, queue->row, queue->col));
1791 db1_printf(("sector %d count %d (%d bytes) %d\n",
1792 (int) req->sectorOffset, (int) req->numSector,
1793 (int) (req->numSector <<
1794 queue->raidPtr->logBytesPerSector),
1795 (int) queue->raidPtr->logBytesPerSector));
1796 if ((raidbp->rf_buf.b_flags & B_READ) == 0) {
1797 raidbp->rf_buf.b_vp->v_numoutput++;
1798 }
1799 VOP_STRATEGY(&raidbp->rf_buf);
1800
1801 break;
1802
1803 default:
1804 panic("bad req->type in rf_DispatchKernelIO");
1805 }
1806 db1_printf(("Exiting from DispatchKernelIO\n"));
1807 /* splx(s); */ /* want to test this */
1808 return (0);
1809 }
1810 /* this is the callback function associated with a I/O invoked from
1811 kernel code.
1812 */
1813 static void
1814 KernelWakeupFunc(vbp)
1815 struct buf *vbp;
1816 {
1817 RF_DiskQueueData_t *req = NULL;
1818 RF_DiskQueue_t *queue;
1819 struct raidbuf *raidbp = (struct raidbuf *) vbp;
1820 struct buf *bp;
1821 struct raid_softc *rs;
1822 int unit;
1823 int s;
1824
1825 s = splbio();
1826 db1_printf(("recovering the request queue:\n"));
1827 req = raidbp->req;
1828
1829 bp = raidbp->rf_obp;
1830
1831 queue = (RF_DiskQueue_t *) req->queue;
1832
1833 if (raidbp->rf_buf.b_flags & B_ERROR) {
1834 bp->b_flags |= B_ERROR;
1835 bp->b_error = raidbp->rf_buf.b_error ?
1836 raidbp->rf_buf.b_error : EIO;
1837 }
1838
1839 /* XXX methinks this could be wrong... */
1840 #if 1
1841 bp->b_resid = raidbp->rf_buf.b_resid;
1842 #endif
1843
1844 if (req->tracerec) {
1845 RF_ETIMER_STOP(req->tracerec->timer);
1846 RF_ETIMER_EVAL(req->tracerec->timer);
1847 RF_LOCK_MUTEX(rf_tracing_mutex);
1848 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
1849 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
1850 req->tracerec->num_phys_ios++;
1851 RF_UNLOCK_MUTEX(rf_tracing_mutex);
1852 }
1853 bp->b_bcount = raidbp->rf_buf.b_bcount; /* XXXX ?? */
1854
1855 unit = queue->raidPtr->raidid; /* *Much* simpler :-> */
1856
1857
1858 /* XXX Ok, let's get aggressive... If B_ERROR is set, let's go
1859 * ballistic, and mark the component as hosed... */
1860
1861 if (bp->b_flags & B_ERROR) {
1862 /* Mark the disk as dead */
1863 /* but only mark it once... */
1864 if (queue->raidPtr->Disks[queue->row][queue->col].status ==
1865 rf_ds_optimal) {
1866 printf("raid%d: IO Error. Marking %s as failed.\n",
1867 unit, queue->raidPtr->Disks[queue->row][queue->col].devname);
1868 queue->raidPtr->Disks[queue->row][queue->col].status =
1869 rf_ds_failed;
1870 queue->raidPtr->status[queue->row] = rf_rs_degraded;
1871 queue->raidPtr->numFailures++;
1872 queue->raidPtr->numNewFailures++;
1873 } else { /* Disk is already dead... */
1874 /* printf("Disk already marked as dead!\n"); */
1875 }
1876
1877 }
1878
1879 rs = &raid_softc[unit];
1880 RAIDPUTBUF(rs, raidbp);
1881
1882
1883 if (bp->b_resid == 0) {
1884 /* XXX is this the right place for a disk_unbusy()??!??!?!? */
1885 disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid));
1886 }
1887
1888 rf_DiskIOComplete(queue, req, (bp->b_flags & B_ERROR) ? 1 : 0);
1889 (req->CompleteFunc) (req->argument, (bp->b_flags & B_ERROR) ? 1 : 0);
1890
1891 splx(s);
1892 }
1893
1894
1895
1896 /*
1897 * initialize a buf structure for doing an I/O in the kernel.
1898 */
1899 static void
1900 InitBP(bp, b_vp, rw_flag, dev, startSect, numSect, buf, cbFunc, cbArg,
1901 logBytesPerSector, b_proc)
1902 struct buf *bp;
1903 struct vnode *b_vp;
1904 unsigned rw_flag;
1905 dev_t dev;
1906 RF_SectorNum_t startSect;
1907 RF_SectorCount_t numSect;
1908 caddr_t buf;
1909 void (*cbFunc) (struct buf *);
1910 void *cbArg;
1911 int logBytesPerSector;
1912 struct proc *b_proc;
1913 {
1914 /* bp->b_flags = B_PHYS | rw_flag; */
1915 bp->b_flags = B_CALL | rw_flag; /* XXX need B_PHYS here too??? */
1916 bp->b_bcount = numSect << logBytesPerSector;
1917 bp->b_bufsize = bp->b_bcount;
1918 bp->b_error = 0;
1919 bp->b_dev = dev;
1920 bp->b_data = buf;
1921 bp->b_blkno = startSect;
1922 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
1923 if (bp->b_bcount == 0) {
1924 panic("bp->b_bcount is zero in InitBP!!\n");
1925 }
1926 bp->b_proc = b_proc;
1927 bp->b_iodone = cbFunc;
1928 bp->b_vp = b_vp;
1929
1930 }
1931
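/*
 * Build a default disklabel for the RAID set, using the geometry
 * (total sectors, sector size, sectors per stripe) that RAIDframe
 * reports for it.
 */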
1932 static void
1933 raidgetdefaultlabel(raidPtr, rs, lp)
1934 RF_Raid_t *raidPtr;
1935 struct raid_softc *rs;
1936 struct disklabel *lp;
1937 {
1938 db1_printf(("Building a default label...\n"));
1939 bzero(lp, sizeof(*lp));
1940
1941 /* fabricate a label... */
1942 lp->d_secperunit = raidPtr->totalSectors;
1943 lp->d_secsize = raidPtr->bytesPerSector;
1944 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
1945 lp->d_ntracks = 1;
1946 lp->d_ncylinders = raidPtr->totalSectors /
1947 (lp->d_nsectors * lp->d_ntracks);
1948 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1949
1950 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
1951 lp->d_type = DTYPE_RAID;
1952 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1953 lp->d_rpm = 3600;
1954 lp->d_interleave = 1;
1955 lp->d_flags = 0;
1956
1957 lp->d_partitions[RAW_PART].p_offset = 0;
1958 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
1959 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1960 lp->d_npartitions = RAW_PART + 1;
1961
1962 lp->d_magic = DISKMAGIC;
1963 lp->d_magic2 = DISKMAGIC;
1964 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
1965
1966 }
1967 /*
1968 * Read the disklabel from the raid device. If one is not present, fake one
1969 * up.
1970 */
1971 static void
1972 raidgetdisklabel(dev)
1973 dev_t dev;
1974 {
1975 int unit = raidunit(dev);
1976 struct raid_softc *rs = &raid_softc[unit];
1977 char *errstring;
1978 struct disklabel *lp = rs->sc_dkdev.dk_label;
1979 struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
1980 RF_Raid_t *raidPtr;
1981
1982 db1_printf(("Getting the disklabel...\n"));
1983
1984 bzero(clp, sizeof(*clp));
1985
1986 raidPtr = raidPtrs[unit];
1987
1988 raidgetdefaultlabel(raidPtr, rs, lp);
1989
1990 /*
1991 * Call the generic disklabel extraction routine.
1992 */
1993 errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
1994 rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
1995 if (errstring)
1996 raidmakedisklabel(rs);
1997 else {
1998 int i;
1999 struct partition *pp;
2000
2001 /*
2002 * Sanity check whether the found disklabel is valid.
2003 *
2004 * This is necessary since the total size of the raid device
2005 * may vary when the interleave is changed, even though exactly
2006 * the same components are used, and an old disklabel may be
2007 * used if one is found.
2008 */
2009 if (lp->d_secperunit != rs->sc_size)
2010 printf("WARNING: %s: "
2011 "total sector size in disklabel (%d) != "
2012 "the size of raid (%ld)\n", rs->sc_xname,
2013 lp->d_secperunit, (long) rs->sc_size);
2014 for (i = 0; i < lp->d_npartitions; i++) {
2015 pp = &lp->d_partitions[i];
2016 if (pp->p_offset + pp->p_size > rs->sc_size)
2017 printf("WARNING: %s: end of partition `%c' "
2018 "exceeds the size of raid (%ld)\n",
2019 rs->sc_xname, 'a' + i, (long) rs->sc_size);
2020 }
2021 }
2022
2023 }
2024 /*
2025 * Take care of things one might want to take care of in the event
2026 * that a disklabel isn't present.
2027 */
2028 static void
2029 raidmakedisklabel(rs)
2030 struct raid_softc *rs;
2031 {
2032 struct disklabel *lp = rs->sc_dkdev.dk_label;
2033 db1_printf(("Making a label..\n"));
2034
2035 /*
2036 * For historical reasons, if there's no disklabel present
2037 * the raw partition must be marked FS_BSDFFS.
2038 */
2039
2040 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2041
2042 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2043
2044 lp->d_checksum = dkcksum(lp);
2045 }
2046 /*
2047 * Lookup the provided name in the filesystem. If the file exists,
2048 * is a valid block device, and isn't being used by anyone else,
2049 * set *vpp to the file's vnode.
2050 * You'll find the original of this in ccd.c
2051 */
2052 int
2053 raidlookup(path, p, vpp)
2054 char *path;
2055 struct proc *p;
2056 struct vnode **vpp; /* result */
2057 {
2058 struct nameidata nd;
2059 struct vnode *vp;
2060 struct vattr va;
2061 int error;
2062
2063 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, p);
2064 if ((error = vn_open(&nd, FREAD | FWRITE, 0)) != 0) {
2065 #ifdef DEBUG
2066 printf("RAIDframe: vn_open returned %d\n", error);
2067 #endif
2068 return (error);
2069 }
2070 vp = nd.ni_vp;
2071 if (vp->v_usecount > 1) {
2072 VOP_UNLOCK(vp, 0);
2073 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
2074 return (EBUSY);
2075 }
2076 if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) != 0) {
2077 VOP_UNLOCK(vp, 0);
2078 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
2079 return (error);
2080 }
2081 /* XXX: eventually we should handle VREG, too. */
2082 if (va.va_type != VBLK) {
2083 VOP_UNLOCK(vp, 0);
2084 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
2085 return (ENOTBLK);
2086 }
2087 VOP_UNLOCK(vp, 0);
2088 *vpp = vp;
2089 return (0);
2090 }
2091 /*
2092 * Wait interruptibly for an exclusive lock.
2093 *
2094 * XXX
2095 * Several drivers do this; it should be abstracted and made MP-safe.
2096 * (Hmm... where have we seen this warning before :-> GO )
2097 */
2098 static int
2099 raidlock(rs)
2100 struct raid_softc *rs;
2101 {
2102 int error;
2103
2104 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2105 rs->sc_flags |= RAIDF_WANTED;
2106 if ((error =
2107 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2108 return (error);
2109 }
2110 rs->sc_flags |= RAIDF_LOCKED;
2111 return (0);
2112 }
2113 /*
2114 * Unlock and wake up any waiters.
2115 */
2116 static void
2117 raidunlock(rs)
2118 struct raid_softc *rs;
2119 {
2120
2121 rs->sc_flags &= ~RAIDF_LOCKED;
2122 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2123 rs->sc_flags &= ~RAIDF_WANTED;
2124 wakeup(rs);
2125 }
2126 }
2127
2128
2129 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2130 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2131
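/*
 * Mark the component on the given device as clean: read its component
 * label, update the modification counter, set the clean flag, and
 * write the label back.
 */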
2132 int
2133 raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter)
2134 {
2135 RF_ComponentLabel_t clabel;
2136 raidread_component_label(dev, b_vp, &clabel);
2137 clabel.mod_counter = mod_counter;
2138 clabel.clean = RF_RAID_CLEAN;
2139 raidwrite_component_label(dev, b_vp, &clabel);
2140 return(0);
2141 }
2142
2143
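/*
 * As above, but mark the component as dirty.
 */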
2144 int
2145 raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter)
2146 {
2147 RF_ComponentLabel_t clabel;
2148 raidread_component_label(dev, b_vp, &clabel);
2149 clabel.mod_counter = mod_counter;
2150 clabel.clean = RF_RAID_DIRTY;
2151 raidwrite_component_label(dev, b_vp, &clabel);
2152 return(0);
2153 }
2154
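/*
 * Read the component label of the given component into *clabel.
 * The label lives RF_COMPONENT_INFO_OFFSET bytes into the component.
 */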
2155 /* ARGSUSED */
2156 int
2157 raidread_component_label(dev, b_vp, clabel)
2158 dev_t dev;
2159 struct vnode *b_vp;
2160 RF_ComponentLabel_t *clabel;
2161 {
2162 struct buf *bp;
2163 int error;
2164
2165 /* XXX should probably ensure that we don't try to do this if
2166 someone has changed rf_protected_sectors. */
2167
2168 /* get a block of the appropriate size... */
2169 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2170 bp->b_dev = dev;
2171
2172 /* get our ducks in a row for the read */
2173 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2174 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2175 bp->b_flags = B_BUSY | B_READ;
2176 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2177
2178 (*bdevsw[major(bp->b_dev)].d_strategy)(bp);
2179
2180 error = biowait(bp);
2181
2182 if (!error) {
2183 memcpy(clabel, bp->b_data,
2184 sizeof(RF_ComponentLabel_t));
2185 #if 0
2186 rf_print_component_label( clabel );
2187 #endif
2188 } else {
2189 #if 0
2190 printf("Failed to read RAID component label!\n");
2191 #endif
2192 }
2193
2194 bp->b_flags = B_INVAL | B_AGE;
2195 brelse(bp);
2196 return(error);
2197 }
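/*
 * Write *clabel to the component label area of the given component.
 */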
2198 /* ARGSUSED */
2199 int
2200 raidwrite_component_label(dev, b_vp, clabel)
2201 dev_t dev;
2202 struct vnode *b_vp;
2203 RF_ComponentLabel_t *clabel;
2204 {
2205 struct buf *bp;
2206 int error;
2207
2208 /* get a block of the appropriate size... */
2209 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2210 bp->b_dev = dev;
2211
2212 /* get our ducks in a row for the write */
2213 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2214 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2215 bp->b_flags = B_BUSY | B_WRITE;
2216 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2217
2218 memset(bp->b_data, 0, RF_COMPONENT_INFO_SIZE );
2219
2220 memcpy(bp->b_data, clabel, sizeof(RF_ComponentLabel_t));
2221
2222 (*bdevsw[major(bp->b_dev)].d_strategy)(bp);
2223 error = biowait(bp);
2224 bp->b_flags = B_INVAL | B_AGE;
2225 brelse(bp);
2226 if (error) {
2227 #if 1
2228 printf("Failed to write RAID component info!\n");
2229 #endif
2230 }
2231
2232 return(error);
2233 }
2234
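/*
 * Bump the modification counter and mark the component labels of all
 * non-failed components dirty. Components whose labels say they have
 * been spared are left alone.
 */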
2235 void
2236 rf_markalldirty(raidPtr)
2237 RF_Raid_t *raidPtr;
2238 {
2239 RF_ComponentLabel_t clabel;
2240 int r,c;
2241
2242 raidPtr->mod_counter++;
2243 for (r = 0; r < raidPtr->numRow; r++) {
2244 for (c = 0; c < raidPtr->numCol; c++) {
2245 if (raidPtr->Disks[r][c].status != rf_ds_failed) {
2246 raidread_component_label(
2247 raidPtr->Disks[r][c].dev,
2248 raidPtr->raid_cinfo[r][c].ci_vp,
2249 &clabel);
2250 if (clabel.status == rf_ds_spared) {
2251 /* XXX do something special...
2252 but whatever you do, don't
2253 try to access it!! */
2254 } else {
2255 #if 0
2256 clabel.status =
2257 raidPtr->Disks[r][c].status;
2258 raidwrite_component_label(
2259 raidPtr->Disks[r][c].dev,
2260 raidPtr->raid_cinfo[r][c].ci_vp,
2261 &clabel);
2262 #endif
2263 raidmarkdirty(
2264 raidPtr->Disks[r][c].dev,
2265 raidPtr->raid_cinfo[r][c].ci_vp,
2266 raidPtr->mod_counter);
2267 }
2268 }
2269 }
2270 }
2271 /* printf("Component labels marked dirty.\n"); */
2272 #if 0
2273 for( c = 0; c < raidPtr->numSpare ; c++) {
2274 sparecol = raidPtr->numCol + c;
2275 if (raidPtr->Disks[r][sparecol].status == rf_ds_used_spare) {
2276 /*
2277
2278 XXX this is where we get fancy and map this spare
2279 into its correct spot in the array.
2280
2281 */
2282 /*
2283
2284 we claim this disk is "optimal" if it's
2285 rf_ds_used_spare, as that means it should be
2286 directly substitutable for the disk it replaced.
2287 We note that too...
2288
2289 */
2290
2291 for(i=0;i<raidPtr->numRow;i++) {
2292 for(j=0;j<raidPtr->numCol;j++) {
2293 if ((raidPtr->Disks[i][j].spareRow ==
2294 r) &&
2295 (raidPtr->Disks[i][j].spareCol ==
2296 sparecol)) {
2297 srow = r;
2298 scol = sparecol;
2299 break;
2300 }
2301 }
2302 }
2303
2304 raidread_component_label(
2305 raidPtr->Disks[r][sparecol].dev,
2306 raidPtr->raid_cinfo[r][sparecol].ci_vp,
2307 &clabel);
2308 /* make sure status is noted */
2309 clabel.version = RF_COMPONENT_LABEL_VERSION;
2310 clabel.mod_counter = raidPtr->mod_counter;
2311 clabel.serial_number = raidPtr->serial_number;
2312 clabel.row = srow;
2313 clabel.column = scol;
2314 clabel.num_rows = raidPtr->numRow;
2315 clabel.num_columns = raidPtr->numCol;
2316 clabel.clean = RF_RAID_DIRTY; /* changed in a bit*/
2317 clabel.status = rf_ds_optimal;
2318 raidwrite_component_label(
2319 raidPtr->Disks[r][sparecol].dev,
2320 raidPtr->raid_cinfo[r][sparecol].ci_vp,
2321 &clabel);
2322 raidmarkclean( raidPtr->Disks[r][sparecol].dev,
2323 raidPtr->raid_cinfo[r][sparecol].ci_vp);
2324 }
2325 }
2326
2327 #endif
2328 }
2329
2330
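/*
 * Re-write the component labels of all optimal components (and of any
 * spares that are in use), bumping the modification counter. If this
 * is the final update and the parity is known to be good, the labels
 * are also marked clean.
 */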
2331 void
2332 rf_update_component_labels(raidPtr, final)
2333 RF_Raid_t *raidPtr;
2334 int final;
2335 {
2336 RF_ComponentLabel_t clabel;
2337 int sparecol;
2338 int r,c;
2339 int i,j;
2340 int srow, scol;
2341
2342 srow = -1;
2343 scol = -1;
2344
2345 /* XXX should do extra checks to make sure things really are clean,
2346 rather than blindly setting the clean bit... */
2347
2348 raidPtr->mod_counter++;
2349
2350 for (r = 0; r < raidPtr->numRow; r++) {
2351 for (c = 0; c < raidPtr->numCol; c++) {
2352 if (raidPtr->Disks[r][c].status == rf_ds_optimal) {
2353 raidread_component_label(
2354 raidPtr->Disks[r][c].dev,
2355 raidPtr->raid_cinfo[r][c].ci_vp,
2356 &clabel);
2357 /* make sure status is noted */
2358 clabel.status = rf_ds_optimal;
2359 /* bump the counter */
2360 clabel.mod_counter = raidPtr->mod_counter;
2361
2362 raidwrite_component_label(
2363 raidPtr->Disks[r][c].dev,
2364 raidPtr->raid_cinfo[r][c].ci_vp,
2365 &clabel);
2366 if (final == RF_FINAL_COMPONENT_UPDATE) {
2367 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2368 raidmarkclean(
2369 raidPtr->Disks[r][c].dev,
2370 raidPtr->raid_cinfo[r][c].ci_vp,
2371 raidPtr->mod_counter);
2372 }
2373 }
2374 }
2375 /* else we don't touch it.. */
2376 }
2377 }
2378
2379 for( c = 0; c < raidPtr->numSpare ; c++) {
2380 sparecol = raidPtr->numCol + c;
2381 if (raidPtr->Disks[0][sparecol].status == rf_ds_used_spare) {
2382 /*
2383
2384 we claim this disk is "optimal" if it's
2385 rf_ds_used_spare, as that means it should be
2386 directly substitutable for the disk it replaced.
2387 We note that too...
2388
2389 */
2390
2391 for(i=0;i<raidPtr->numRow;i++) {
2392 for(j=0;j<raidPtr->numCol;j++) {
2393 if ((raidPtr->Disks[i][j].spareRow ==
2394 0) &&
2395 (raidPtr->Disks[i][j].spareCol ==
2396 sparecol)) {
2397 srow = i;
2398 scol = j;
2399 break;
2400 }
2401 }
2402 }
2403
2404 /* XXX shouldn't *really* need this... */
2405 raidread_component_label(
2406 raidPtr->Disks[0][sparecol].dev,
2407 raidPtr->raid_cinfo[0][sparecol].ci_vp,
2408 &clabel);
2409 /* make sure status is noted */
2410
2411 raid_init_component_label(raidPtr, &clabel);
2412
2413 clabel.mod_counter = raidPtr->mod_counter;
2414 clabel.row = srow;
2415 clabel.column = scol;
2416 clabel.status = rf_ds_optimal;
2417
2418 raidwrite_component_label(
2419 raidPtr->Disks[0][sparecol].dev,
2420 raidPtr->raid_cinfo[0][sparecol].ci_vp,
2421 &clabel);
2422 if (final == RF_FINAL_COMPONENT_UPDATE) {
2423 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2424 raidmarkclean( raidPtr->Disks[0][sparecol].dev,
2425 raidPtr->raid_cinfo[0][sparecol].ci_vp,
2426 raidPtr->mod_counter);
2427 }
2428 }
2429 }
2430 }
2431 /* printf("Component labels updated\n"); */
2432 }
2433
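/*
 * Close the vnode of a component. Auto-configured components were
 * opened read-only by the autoconfiguration code; anything else was
 * opened read/write via raidlookup() and is closed with vn_close().
 */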
2434 void
2435 rf_close_component(raidPtr, vp, auto_configured)
2436 RF_Raid_t *raidPtr;
2437 struct vnode *vp;
2438 int auto_configured;
2439 {
2440 struct proc *p;
2441
2442 p = raidPtr->engine_thread;
2443
2444 if (vp != NULL) {
2445 if (auto_configured == 1) {
2446 VOP_CLOSE(vp, FREAD, NOCRED, 0);
2447 vput(vp);
2448
2449 } else {
2450 VOP_UNLOCK(vp, 0);
2451 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
2452 }
2453 } else {
2454 printf("vnode was NULL\n");
2455 }
2456 }
2457
2458
2459 void
2460 rf_UnconfigureVnodes(raidPtr)
2461 RF_Raid_t *raidPtr;
2462 {
2463 int r,c;
2464 struct proc *p;
2465 struct vnode *vp;
2466 int acd;
2467
2468
2469 /* We take this opportunity to close the vnodes like we should.. */
2470
2471 p = raidPtr->engine_thread;
2472
2473 for (r = 0; r < raidPtr->numRow; r++) {
2474 for (c = 0; c < raidPtr->numCol; c++) {
2475 printf("Closing vnode for row: %d col: %d\n", r, c);
2476 vp = raidPtr->raid_cinfo[r][c].ci_vp;
2477 acd = raidPtr->Disks[r][c].auto_configured;
2478 rf_close_component(raidPtr, vp, acd);
2479 raidPtr->raid_cinfo[r][c].ci_vp = NULL;
2480 raidPtr->Disks[r][c].auto_configured = 0;
2481 }
2482 }
2483 for (r = 0; r < raidPtr->numSpare; r++) {
2484 printf("Closing vnode for spare: %d\n", r);
2485 vp = raidPtr->raid_cinfo[0][raidPtr->numCol + r].ci_vp;
2486 acd = raidPtr->Disks[0][raidPtr->numCol + r].auto_configured;
2487 rf_close_component(raidPtr, vp, acd);
2488 raidPtr->raid_cinfo[0][raidPtr->numCol + r].ci_vp = NULL;
2489 raidPtr->Disks[0][raidPtr->numCol + r].auto_configured = 0;
2490 }
2491 }
2492
2493
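/*
 * Kernel thread: fail the indicated component, optionally kicking off
 * a reconstruction onto a spare (RF_FDFLAGS_RECON), then exit.
 */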
2494 void
2495 rf_ReconThread(req)
2496 struct rf_recon_req *req;
2497 {
2498 int s;
2499 RF_Raid_t *raidPtr;
2500
2501 s = splbio();
2502 raidPtr = (RF_Raid_t *) req->raidPtr;
2503 raidPtr->recon_in_progress = 1;
2504
2505 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->row, req->col,
2506 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2507
2508 /* XXX get rid of this! we don't need it at all.. */
2509 RF_Free(req, sizeof(*req));
2510
2511 raidPtr->recon_in_progress = 0;
2512 splx(s);
2513
2514 /* That's all... */
2515 kthread_exit(0); /* does not return */
2516 }
2517
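/*
 * Kernel thread: re-write the parity for the entire set. On success,
 * the parity is marked good so that the clean bits can be set on the
 * component labels at shutdown.
 */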
2518 void
2519 rf_RewriteParityThread(raidPtr)
2520 RF_Raid_t *raidPtr;
2521 {
2522 int retcode;
2523 int s;
2524
2525 raidPtr->parity_rewrite_in_progress = 1;
2526 s = splbio();
2527 retcode = rf_RewriteParity(raidPtr);
2528 splx(s);
2529 if (retcode) {
2530 printf("raid%d: Error re-writing parity!\n",raidPtr->raidid);
2531 } else {
2532 /* set the clean bit! If we shutdown correctly,
2533 the clean bit on each component label will get
2534 set */
2535 raidPtr->parity_good = RF_RAID_CLEAN;
2536 }
2537 raidPtr->parity_rewrite_in_progress = 0;
2538
2539 /* Anyone waiting for us to stop? If so, inform them... */
2540 if (raidPtr->waitShutdown) {
2541 wakeup(&raidPtr->parity_rewrite_in_progress);
2542 }
2543
2544 /* That's all... */
2545 kthread_exit(0); /* does not return */
2546 }
2547
2548
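/*
 * Kernel thread: copy reconstructed data back from the spare(s) to the
 * replaced component(s), then exit.
 */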
2549 void
2550 rf_CopybackThread(raidPtr)
2551 RF_Raid_t *raidPtr;
2552 {
2553 int s;
2554
2555 raidPtr->copyback_in_progress = 1;
2556 s = splbio();
2557 rf_CopybackReconstructedData(raidPtr);
2558 splx(s);
2559 raidPtr->copyback_in_progress = 0;
2560
2561 /* That's all... */
2562 kthread_exit(0); /* does not return */
2563 }
2564
2565
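/*
 * Kernel thread: reconstruct the indicated component "in place" (i.e.
 * onto the same device), then exit.
 */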
2566 void
2567 rf_ReconstructInPlaceThread(req)
2568 struct rf_recon_req *req;
2569 {
2570 int retcode;
2571 int s;
2572 RF_Raid_t *raidPtr;
2573
2574 s = splbio();
2575 raidPtr = req->raidPtr;
2576 raidPtr->recon_in_progress = 1;
2577 retcode = rf_ReconstructInPlace(raidPtr, req->row, req->col);
2578 RF_Free(req, sizeof(*req));
2579 raidPtr->recon_in_progress = 0;
2580 splx(s);
2581
2582 /* That's all... */
2583 kthread_exit(0); /* does not return */
2584 }
2585
2586 void
2587 rf_mountroot_hook(dev)
2588 struct device *dev;
2589 {
2590
2591 }
2592
2593
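/*
 * Scan all disk devices in the system for partitions of type FS_RAID
 * that carry a plausible RAIDframe component label, and return them as
 * a list of RF_AutoConfig_t structures.
 */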
2594 RF_AutoConfig_t *
2595 rf_find_raid_components()
2596 {
2597 struct devnametobdevmaj *dtobdm;
2598 struct vnode *vp;
2599 struct disklabel label;
2600 struct device *dv;
2601 char *cd_name;
2602 dev_t dev;
2603 int error;
2604 int i;
2605 int good_one;
2606 RF_ComponentLabel_t *clabel;
2607 RF_AutoConfig_t *ac_list;
2608 RF_AutoConfig_t *ac;
2609
2610
2611 /* initialize the AutoConfig list */
2612 ac_list = NULL;
2613
2614 if (raidautoconfig) {
2615
2616 /* we begin by trolling through *all* the devices on the system */
2617
2618 for (dv = alldevs.tqh_first; dv != NULL;
2619 dv = dv->dv_list.tqe_next) {
2620
2621 /* we are only interested in disks... */
2622 if (dv->dv_class != DV_DISK)
2623 continue;
2624
2625 /* we don't care about floppies... */
2626 if (!strcmp(dv->dv_cfdata->cf_driver->cd_name,"fd")) {
2627 continue;
2628 }
2629
2630 /* need to find the device_name_to_block_device_major stuff */
2631 cd_name = dv->dv_cfdata->cf_driver->cd_name;
2632 dtobdm = dev_name2blk;
2633 while (dtobdm->d_name && strcmp(dtobdm->d_name, cd_name)) {
2634 dtobdm++;
2635 }
2636
2637 /* get a vnode for the raw partition of this disk */
2638
2639 dev = MAKEDISKDEV(dtobdm->d_maj, dv->dv_unit, RAW_PART);
2640 if (bdevvp(dev, &vp))
2641 panic("RAID can't alloc vnode");
2642
2643 error = VOP_OPEN(vp, FREAD, NOCRED, 0);
2644
2645 if (error) {
2646 /* "Who cares." Continue looking
2647 for something that exists */
2648 vput(vp);
2649 continue;
2650 }
2651
2652 /* Ok, the disk exists. Go get the disklabel. */
2653 error = VOP_IOCTL(vp, DIOCGDINFO, (caddr_t)&label,
2654 FREAD, NOCRED, 0);
2655 if (error) {
2656 /*
2657 * XXX can't happen - open() would
2658 * have errored out (or faked up one)
2659 */
2660 printf("can't get label for dev %s%c (%d)!?!?\n",
2661 dv->dv_xname, 'a' + RAW_PART, error);
2662 }
2663
2664 /* don't need this any more. We'll allocate it again
2665 a little later if we really do... */
2666 VOP_CLOSE(vp, FREAD, NOCRED, 0);
2667 vput(vp);
2668
2669 for (i=0; i < label.d_npartitions; i++) {
2670 /* We only support partitions marked as RAID */
2671 if (label.d_partitions[i].p_fstype != FS_RAID)
2672 continue;
2673
2674 dev = MAKEDISKDEV(dtobdm->d_maj, dv->dv_unit, i);
2675 if (bdevvp(dev, &vp))
2676 panic("RAID can't alloc vnode");
2677
2678 error = VOP_OPEN(vp, FREAD, NOCRED, 0);
2679 if (error) {
2680 /* Whatever... */
2681 vput(vp);
2682 continue;
2683 }
2684
2685 good_one = 0;
2686
2687 clabel = (RF_ComponentLabel_t *)
2688 malloc(sizeof(RF_ComponentLabel_t),
2689 M_RAIDFRAME, M_NOWAIT);
2690 if (clabel == NULL) {
2691 /* XXX CLEANUP HERE */
2692 printf("RAID auto config: out of memory!\n");
2693 return(NULL); /* XXX probably should panic? */
2694 }
2695
2696 if (!raidread_component_label(dev, vp, clabel)) {
2697 /* Got the label. Does it look reasonable? */
2698 if (rf_reasonable_label(clabel) &&
2699 (clabel->partitionSize <=
2700 label.d_partitions[i].p_size)) {
2701 #if DEBUG
2702 printf("Component on: %s%c: %d\n",
2703 dv->dv_xname, 'a'+i,
2704 label.d_partitions[i].p_size);
2705 rf_print_component_label(clabel);
2706 #endif
2707 /* if it's reasonable, add it,
2708 else ignore it. */
2709 ac = (RF_AutoConfig_t *)
2710 malloc(sizeof(RF_AutoConfig_t),
2711 M_RAIDFRAME,
2712 M_NOWAIT);
2713 if (ac == NULL) {
2714 /* XXX should panic?? */
2715 return(NULL);
2716 }
2717
2718 sprintf(ac->devname, "%s%c",
2719 dv->dv_xname, 'a'+i);
2720 ac->dev = dev;
2721 ac->vp = vp;
2722 ac->clabel = clabel;
2723 ac->next = ac_list;
2724 ac_list = ac;
2725 good_one = 1;
2726 }
2727 }
2728 if (!good_one) {
2729 /* cleanup */
2730 free(clabel, M_RAIDFRAME);
2731 VOP_CLOSE(vp, FREAD, NOCRED, 0);
2732 vput(vp);
2733 }
2734 }
2735 }
2736 }
2737 return(ac_list);
2738 }
2739
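/*
 * Perform basic sanity checks on a component label: a known version,
 * a sane clean flag, and row/column/size values that are consistent
 * with each other.
 */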
2740 static int
2741 rf_reasonable_label(clabel)
2742 RF_ComponentLabel_t *clabel;
2743 {
2744
2745 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2746 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2747 ((clabel->clean == RF_RAID_CLEAN) ||
2748 (clabel->clean == RF_RAID_DIRTY)) &&
2749 clabel->row >=0 &&
2750 clabel->column >= 0 &&
2751 clabel->num_rows > 0 &&
2752 clabel->num_columns > 0 &&
2753 clabel->row < clabel->num_rows &&
2754 clabel->column < clabel->num_columns &&
2755 clabel->blockSize > 0 &&
2756 clabel->numBlocks > 0) {
2757 /* label looks reasonable enough... */
2758 return(1);
2759 }
2760 return(0);
2761 }
2762
2763
2764 void
2765 rf_print_component_label(clabel)
2766 RF_ComponentLabel_t *clabel;
2767 {
2768 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
2769 clabel->row, clabel->column,
2770 clabel->num_rows, clabel->num_columns);
2771 printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
2772 clabel->version, clabel->serial_number,
2773 clabel->mod_counter);
2774 printf(" Clean: %s Status: %d\n",
2775 clabel->clean ? "Yes" : "No", clabel->status );
2776 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
2777 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
2778 printf(" RAID Level: %c blocksize: %d numBlocks: %d\n",
2779 (char) clabel->parityConfig, clabel->blockSize,
2780 clabel->numBlocks);
2781 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No" );
2782 printf(" Contains root partition: %s\n",
2783 clabel->root_partition ? "Yes" : "No" );
2784 printf(" Last configured as: raid%d\n", clabel->last_unit );
2785 #if 0
2786 printf(" Config order: %d\n", clabel->config_order);
2787 #endif
2788
2789 }
2790
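/*
 * Sort the auto-detected components into configuration sets.
 * Components whose labels match (see rf_does_it_fit()) end up in the
 * same RF_ConfigSet_t.
 */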
2791 RF_ConfigSet_t *
2792 rf_create_auto_sets(ac_list)
2793 RF_AutoConfig_t *ac_list;
2794 {
2795 RF_AutoConfig_t *ac;
2796 RF_ConfigSet_t *config_sets;
2797 RF_ConfigSet_t *cset;
2798 RF_AutoConfig_t *ac_next;
2799
2800
2801 config_sets = NULL;
2802
2803 /* Go through the AutoConfig list, and figure out which components
2804 belong to what sets. */
2805 ac = ac_list;
2806 while(ac!=NULL) {
2807 /* we're going to putz with ac->next, so save it here
2808 for use at the end of the loop */
2809 ac_next = ac->next;
2810
2811 if (config_sets == NULL) {
2812 /* will need at least this one... */
2813 config_sets = (RF_ConfigSet_t *)
2814 malloc(sizeof(RF_ConfigSet_t),
2815 M_RAIDFRAME, M_NOWAIT);
2816 if (config_sets == NULL) {
2817 panic("rf_create_auto_sets: No memory!\n");
2818 }
2819 /* this one is easy :) */
2820 config_sets->ac = ac;
2821 config_sets->next = NULL;
2822 config_sets->rootable = 0;
2823 ac->next = NULL;
2824 } else {
2825 /* which set does this component fit into? */
2826 cset = config_sets;
2827 while(cset!=NULL) {
2828 if (rf_does_it_fit(cset, ac)) {
2829 /* looks like it matches... */
2830 ac->next = cset->ac;
2831 cset->ac = ac;
2832 break;
2833 }
2834 cset = cset->next;
2835 }
2836 if (cset==NULL) {
2837 /* didn't find a match above... new set..*/
2838 cset = (RF_ConfigSet_t *)
2839 malloc(sizeof(RF_ConfigSet_t),
2840 M_RAIDFRAME, M_NOWAIT);
2841 if (cset == NULL) {
2842 panic("rf_create_auto_sets: No memory!\n");
2843 }
2844 cset->ac = ac;
2845 ac->next = NULL;
2846 cset->next = config_sets;
2847 cset->rootable = 0;
2848 config_sets = cset;
2849 }
2850 }
2851 ac = ac_next;
2852 }
2853
2854
2855 return(config_sets);
2856 }
2857
2858 static int
2859 rf_does_it_fit(cset, ac)
2860 RF_ConfigSet_t *cset;
2861 RF_AutoConfig_t *ac;
2862 {
2863 RF_ComponentLabel_t *clabel1, *clabel2;
2864
2865 /* If this one matches the *first* one in the set, that's good
2866 enough, since the other members of the set would have been
2867 through here too... */
2868 /* note that we are not checking partitionSize here..
2869
2870 Note that we are also not checking the mod_counters here.
2871 If everything else matches except the mod_counter, that's
2872 good enough for this test. We will deal with the mod_counters
2873 a little later in the autoconfiguration process.
2874
2875 (clabel1->mod_counter == clabel2->mod_counter) &&
2876
2877 The reason we don't check for this is that failed disks
2878 will have lower modification counts. If those disks are
2879 not added to the set they used to belong to, then they will
2880 form their own set, which may result in 2 different sets,
2881 for example, competing to be configured at raid0, and
2882 perhaps competing to be the root filesystem set. If the
2883 wrong ones get configured, or both attempt to become /,
2884 weird behaviour and/or serious lossage will occur. Thus we
2885 need to bring them into the fold here, and kick them out at
2886 a later point.
2887
2888 */
2889
2890 clabel1 = cset->ac->clabel;
2891 clabel2 = ac->clabel;
2892 if ((clabel1->version == clabel2->version) &&
2893 (clabel1->serial_number == clabel2->serial_number) &&
2894 (clabel1->num_rows == clabel2->num_rows) &&
2895 (clabel1->num_columns == clabel2->num_columns) &&
2896 (clabel1->sectPerSU == clabel2->sectPerSU) &&
2897 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
2898 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
2899 (clabel1->parityConfig == clabel2->parityConfig) &&
2900 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
2901 (clabel1->blockSize == clabel2->blockSize) &&
2902 (clabel1->numBlocks == clabel2->numBlocks) &&
2903 (clabel1->autoconfigure == clabel2->autoconfigure) &&
2904 (clabel1->root_partition == clabel2->root_partition) &&
2905 (clabel1->last_unit == clabel2->last_unit) &&
2906 (clabel1->config_order == clabel2->config_order)) {
2907 /* if it gets here, it almost *has* to be a match */
2908 } else {
2909 /* it's not consistent with somebody in the set..
2910 punt */
2911 return(0);
2912 }
2913 /* all was fine.. it must fit... */
2914 return(1);
2915 }
2916
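/*
 * Decide whether a configuration set has enough live components to be
 * configured, given its parity type. RAID 1 is handled specially,
 * since its components are checked in pairs.
 */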
2917 int
2918 rf_have_enough_components(cset)
2919 RF_ConfigSet_t *cset;
2920 {
2921 RF_AutoConfig_t *ac;
2922 RF_AutoConfig_t *auto_config;
2923 RF_ComponentLabel_t *clabel;
2924 int r,c;
2925 int num_rows;
2926 int num_cols;
2927 int num_missing;
2928 int mod_counter;
2929 int mod_counter_found;
2930 int even_pair_failed;
2931 char parity_type;
2932
2933
2934 /* check to see that we have enough 'live' components
2935 of this set. If so, we can configure it if necessary */
2936
2937 num_rows = cset->ac->clabel->num_rows;
2938 num_cols = cset->ac->clabel->num_columns;
2939 parity_type = cset->ac->clabel->parityConfig;
2940
2941 /* XXX Check for duplicate components!?!?!? */
2942
2943 /* Determine what the mod_counter is supposed to be for this set. */
2944
2945 mod_counter_found = 0;
2946 ac = cset->ac;
2947 while(ac!=NULL) {
2948 if (mod_counter_found==0) {
2949 mod_counter = ac->clabel->mod_counter;
2950 mod_counter_found = 1;
2951 } else {
2952 if (ac->clabel->mod_counter > mod_counter) {
2953 mod_counter = ac->clabel->mod_counter;
2954 }
2955 }
2956 ac = ac->next;
2957 }
2958
2959 num_missing = 0;
2960 auto_config = cset->ac;
2961
2962 for(r=0; r<num_rows; r++) {
2963 even_pair_failed = 0;
2964 for(c=0; c<num_cols; c++) {
2965 ac = auto_config;
2966 while(ac!=NULL) {
2967 if ((ac->clabel->row == r) &&
2968 (ac->clabel->column == c) &&
2969 (ac->clabel->mod_counter == mod_counter)) {
2970 /* it's this one... */
2971 #if DEBUG
2972 printf("Found: %s at %d,%d\n",
2973 ac->devname,r,c);
2974 #endif
2975 break;
2976 }
2977 ac=ac->next;
2978 }
2979 if (ac==NULL) {
2980 /* Didn't find one here! */
2981 /* special case for RAID 1, especially
2982 where there are more than 2
2983 components (where RAIDframe treats
2984 things a little differently :( ) */
2985 if (parity_type == '1') {
2986 if (c%2 == 0) { /* even component */
2987 even_pair_failed = 1;
2988 } else { /* odd component. If
2989 we're failed, and
2990 so is the even
2991 component, it's
2992 "Good Night, Charlie" */
2993 if (even_pair_failed == 1) {
2994 return(0);
2995 }
2996 }
2997 } else {
2998 /* normal accounting */
2999 num_missing++;
3000 }
3001 }
3002 if ((parity_type == '1') && (c%2 == 1)) {
3003 /* Just finished the odd (second) component of a
3004 pair without bailing.. reset the even_pair_failed
3005 flag, and go on to the next pair.... */
3006 even_pair_failed = 0;
3007 }
3008 }
3009 }
3010
3011 clabel = cset->ac->clabel;
3012
3013 if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3014 ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3015 ((clabel->parityConfig == '5') && (num_missing > 1))) {
3016 /* XXX this needs to be made *much* more general */
3017 /* Too many failures */
3018 return(0);
3019 }
3020 /* otherwise, all is well, and we've got enough to take a kick
3021 at autoconfiguring this set */
3022 return(1);
3023 }
3024
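/*
 * Fill in an RF_Config_t from the component labels of an auto-detected
 * set, including the device name of each component.
 */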
3025 void
3026 rf_create_configuration(ac,config,raidPtr)
3027 RF_AutoConfig_t *ac;
3028 RF_Config_t *config;
3029 RF_Raid_t *raidPtr;
3030 {
3031 RF_ComponentLabel_t *clabel;
3032 int i;
3033
3034 clabel = ac->clabel;
3035
3036 /* 1. Fill in the common stuff */
3037 config->numRow = clabel->num_rows;
3038 config->numCol = clabel->num_columns;
3039 config->numSpare = 0; /* XXX should this be set here? */
3040 config->sectPerSU = clabel->sectPerSU;
3041 config->SUsPerPU = clabel->SUsPerPU;
3042 config->SUsPerRU = clabel->SUsPerRU;
3043 config->parityConfig = clabel->parityConfig;
3044 /* XXX... */
3045 strcpy(config->diskQueueType,"fifo");
3046 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3047 config->layoutSpecificSize = 0; /* XXX ?? */
3048
3049 while(ac!=NULL) {
3050 /* row/col values will be in range due to the checks
3051 in reasonable_label() */
3052 strcpy(config->devnames[ac->clabel->row][ac->clabel->column],
3053 ac->devname);
3054 ac = ac->next;
3055 }
3056
3057 for(i=0;i<RF_MAXDBGV;i++) {
3058 config->debugVars[i][0] = '\0';
3059 }
3060 }
3061
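/*
 * Set the autoconfigure flag for the set, and push the new value into
 * the component label of every optimal component.
 */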
3062 int
3063 rf_set_autoconfig(raidPtr, new_value)
3064 RF_Raid_t *raidPtr;
3065 int new_value;
3066 {
3067 RF_ComponentLabel_t clabel;
3068 struct vnode *vp;
3069 dev_t dev;
3070 int row, column;
3071
3072 raidPtr->autoconfigure = new_value;
3073 for(row=0; row<raidPtr->numRow; row++) {
3074 for(column=0; column<raidPtr->numCol; column++) {
3075 if (raidPtr->Disks[row][column].status ==
3076 rf_ds_optimal) {
3077 dev = raidPtr->Disks[row][column].dev;
3078 vp = raidPtr->raid_cinfo[row][column].ci_vp;
3079 raidread_component_label(dev, vp, &clabel);
3080 clabel.autoconfigure = new_value;
3081 raidwrite_component_label(dev, vp, &clabel);
3082 }
3083 }
3084 }
3085 return(new_value);
3086 }
3087
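/*
 * Set the root_partition flag for the set, and push the new value into
 * the component label of every optimal component.
 */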
3088 int
3089 rf_set_rootpartition(raidPtr, new_value)
3090 RF_Raid_t *raidPtr;
3091 int new_value;
3092 {
3093 RF_ComponentLabel_t clabel;
3094 struct vnode *vp;
3095 dev_t dev;
3096 int row, column;
3097
3098 raidPtr->root_partition = new_value;
3099 for(row=0; row<raidPtr->numRow; row++) {
3100 for(column=0; column<raidPtr->numCol; column++) {
3101 if (raidPtr->Disks[row][column].status ==
3102 rf_ds_optimal) {
3103 dev = raidPtr->Disks[row][column].dev;
3104 vp = raidPtr->raid_cinfo[row][column].ci_vp;
3105 raidread_component_label(dev, vp, &clabel);
3106 clabel.root_partition = new_value;
3107 raidwrite_component_label(dev, vp, &clabel);
3108 }
3109 }
3110 }
3111 return(new_value);
3112 }
3113
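/*
 * Close and release the vnodes of all components in a configuration
 * set.
 */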
3114 void
3115 rf_release_all_vps(cset)
3116 RF_ConfigSet_t *cset;
3117 {
3118 RF_AutoConfig_t *ac;
3119
3120 ac = cset->ac;
3121 while(ac!=NULL) {
3122 /* Close the vp, and give it back */
3123 if (ac->vp) {
3124 VOP_CLOSE(ac->vp, FREAD, NOCRED, 0);
3125 vput(ac->vp);
3126 ac->vp = NULL;
3127 }
3128 ac = ac->next;
3129 }
3130 }
3131
3132
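/*
 * Free the component labels and RF_AutoConfig_t structures of a
 * configuration set, and then the set itself.
 */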
3133 void
3134 rf_cleanup_config_set(cset)
3135 RF_ConfigSet_t *cset;
3136 {
3137 RF_AutoConfig_t *ac;
3138 RF_AutoConfig_t *next_ac;
3139
3140 ac = cset->ac;
3141 while(ac!=NULL) {
3142 next_ac = ac->next;
3143 /* nuke the label */
3144 free(ac->clabel, M_RAIDFRAME);
3145 /* cleanup the config structure */
3146 free(ac, M_RAIDFRAME);
3147 /* "next.." */
3148 ac = next_ac;
3149 }
3150 /* and, finally, nuke the config set */
3151 free(cset, M_RAIDFRAME);
3152 }
3153
3154
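/*
 * Initialize a component label from the current state of the set
 * (geometry, serial number, mod counter, and so on). The label is
 * marked dirty here; the clean bit is set elsewhere.
 */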
3155 void
3156 raid_init_component_label(raidPtr, clabel)
3157 RF_Raid_t *raidPtr;
3158 RF_ComponentLabel_t *clabel;
3159 {
3160 /* current version number */
3161 clabel->version = RF_COMPONENT_LABEL_VERSION;
3162 clabel->serial_number = raidPtr->serial_number;
3163 clabel->mod_counter = raidPtr->mod_counter;
3164 clabel->num_rows = raidPtr->numRow;
3165 clabel->num_columns = raidPtr->numCol;
3166 clabel->clean = RF_RAID_DIRTY; /* not clean */
3167 clabel->status = rf_ds_optimal; /* "It's good!" */
3168
3169 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3170 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3171 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3172
3173 clabel->blockSize = raidPtr->bytesPerSector;
3174 clabel->numBlocks = raidPtr->sectorsPerDisk;
3175
3176 /* XXX not portable */
3177 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3178 clabel->maxOutstanding = raidPtr->maxOutstanding;
3179 clabel->autoconfigure = raidPtr->autoconfigure;
3180 clabel->root_partition = raidPtr->root_partition;
3181 clabel->last_unit = raidPtr->raidid;
3182 clabel->config_order = raidPtr->config_order;
3183 }
3184
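/*
 * Configure a RAID set found by the autoconfiguration code. We try to
 * use the unit the set was last configured as; *unit returns the unit
 * number used, and the return value is 0 on success.
 */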
3185 int
3186 rf_auto_config_set(cset,unit)
3187 RF_ConfigSet_t *cset;
3188 int *unit;
3189 {
3190 RF_Raid_t *raidPtr;
3191 RF_Config_t *config;
3192 int raidID;
3193 int retcode;
3194
3195 printf("RAID autoconfigure\n");
3196
3197 retcode = 0;
3198 *unit = -1;
3199
3200 /* 1. Create a config structure */
3201
3202 config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
3203 M_RAIDFRAME,
3204 M_NOWAIT);
3205 if (config==NULL) {
3206 printf("Out of mem!?!?\n");
3207 /* XXX do something more intelligent here. */
3208 return(1);
3209 }
3210
3211 memset(config, 0, sizeof(RF_Config_t));
3212
3213 /* XXX raidID needs to be set correctly.. */
3214
3215 /*
3216 2. Figure out what RAID ID this one is supposed to live at
3217 See if we can get the same RAID dev that it was configured
3218 on last time..
3219 */
3220
3221 raidID = cset->ac->clabel->last_unit;
3222 if ((raidID < 0) || (raidID >= numraid)) {
3223 /* let's not wander off into lala land. */
3224 raidID = numraid - 1;
3225 }
3226 if (raidPtrs[raidID]->valid != 0) {
3227
3228 /*
3229 Nope... Go looking for an alternative...
3230 Start high so we don't immediately use raid0 if that's
3231 not taken.
3232 */
3233
3234 for(raidID = numraid - 1; raidID >= 0; raidID--) {
3235 if (raidPtrs[raidID]->valid == 0) {
3236 /* can use this one! */
3237 break;
3238 }
3239 }
3240 }
3241
3242 if (raidID < 0) {
3243 /* punt... */
3244 printf("Unable to auto configure this set!\n");
3245 printf("(Out of RAID devs!)\n");
3246 return(1);
3247 }
3248 printf("Configuring raid%d:\n",raidID);
3249 raidPtr = raidPtrs[raidID];
3250
3251 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3252 raidPtr->raidid = raidID;
3253 raidPtr->openings = RAIDOUTSTANDING;
3254
3255 /* 3. Build the configuration structure */
3256 rf_create_configuration(cset->ac, config, raidPtr);
3257
3258 /* 4. Do the configuration */
3259 retcode = rf_Configure(raidPtr, config, cset->ac);
3260
3261 if (retcode == 0) {
3262
3263 raidinit(raidPtrs[raidID]);
3264
3265 rf_markalldirty(raidPtrs[raidID]);
3266 raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
3267 if (cset->ac->clabel->root_partition==1) {
3268 /* everything configured just fine. Make a note
3269 that this set is eligible to be root. */
3270 cset->rootable = 1;
3271 /* XXX do this here? */
3272 raidPtrs[raidID]->root_partition = 1;
3273 }
3274 }
3275
3276 /* 5. Cleanup */
3277 free(config, M_RAIDFRAME);
3278
3279 *unit = raidID;
3280 return(retcode);
3281 }
3282