rf_netbsdkintf.c revision 1.72 1 /* $NetBSD: rf_netbsdkintf.c,v 1.72 2000/03/22 01:41:41 oster Exp $ */
2 /*-
3 * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Greg Oster; Jason R. Thorpe.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the NetBSD
20 * Foundation, Inc. and its contributors.
21 * 4. Neither the name of The NetBSD Foundation nor the names of its
22 * contributors may be used to endorse or promote products derived
23 * from this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38 /*
39 * Copyright (c) 1988 University of Utah.
40 * Copyright (c) 1990, 1993
41 * The Regents of the University of California. All rights reserved.
42 *
43 * This code is derived from software contributed to Berkeley by
44 * the Systems Programming Group of the University of Utah Computer
45 * Science Department.
46 *
47 * Redistribution and use in source and binary forms, with or without
48 * modification, are permitted provided that the following conditions
49 * are met:
50 * 1. Redistributions of source code must retain the above copyright
51 * notice, this list of conditions and the following disclaimer.
52 * 2. Redistributions in binary form must reproduce the above copyright
53 * notice, this list of conditions and the following disclaimer in the
54 * documentation and/or other materials provided with the distribution.
55 * 3. All advertising materials mentioning features or use of this software
56 * must display the following acknowledgement:
57 * This product includes software developed by the University of
58 * California, Berkeley and its contributors.
59 * 4. Neither the name of the University nor the names of its contributors
60 * may be used to endorse or promote products derived from this software
61 * without specific prior written permission.
62 *
63 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73 * SUCH DAMAGE.
74 *
75 * from: Utah $Hdr: cd.c 1.6 90/11/28$
76 *
77 * @(#)cd.c 8.2 (Berkeley) 11/16/93
78 */
79
80
81
82
83 /*
84 * Copyright (c) 1995 Carnegie-Mellon University.
85 * All rights reserved.
86 *
87 * Authors: Mark Holland, Jim Zelenka
88 *
89 * Permission to use, copy, modify and distribute this software and
90 * its documentation is hereby granted, provided that both the copyright
91 * notice and this permission notice appear in all copies of the
92 * software, derivative works or modified versions, and any portions
93 * thereof, and that both notices appear in supporting documentation.
94 *
95 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
96 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
97 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
98 *
99 * Carnegie Mellon requests users of this software to return to
100 *
101 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
102 * School of Computer Science
103 * Carnegie Mellon University
104 * Pittsburgh PA 15213-3890
105 *
106 * any improvements or extensions that they make and grant Carnegie the
107 * rights to redistribute these changes.
108 */
109
110 /***********************************************************
111 *
112 * rf_kintf.c -- the kernel interface routines for RAIDframe
113 *
114 ***********************************************************/
115
116 #include <sys/errno.h>
117 #include <sys/param.h>
118 #include <sys/pool.h>
119 #include <sys/queue.h>
120 #include <sys/disk.h>
121 #include <sys/device.h>
122 #include <sys/stat.h>
123 #include <sys/ioctl.h>
124 #include <sys/fcntl.h>
125 #include <sys/systm.h>
126 #include <sys/namei.h>
127 #include <sys/vnode.h>
128 #include <sys/param.h>
129 #include <sys/types.h>
130 #include <machine/types.h>
131 #include <sys/disklabel.h>
132 #include <sys/conf.h>
133 #include <sys/lock.h>
134 #include <sys/buf.h>
135 #include <sys/user.h>
136 #include <sys/reboot.h>
137
138 #include "raid.h"
139 #include "opt_raid_autoconfig.h"
140 #include "rf_raid.h"
141 #include "rf_raidframe.h"
142 #include "rf_copyback.h"
143 #include "rf_dag.h"
144 #include "rf_dagflags.h"
145 #include "rf_diskqueue.h"
146 #include "rf_acctrace.h"
147 #include "rf_etimer.h"
148 #include "rf_general.h"
149 #include "rf_debugMem.h"
150 #include "rf_kintf.h"
151 #include "rf_options.h"
152 #include "rf_driver.h"
153 #include "rf_parityscan.h"
154 #include "rf_debugprint.h"
155 #include "rf_threadstuff.h"
156 #include "rf_configure.h"
157
/* Run-time verbosity for db1_printf(); messages appear when this is > 0. */
int     rf_kdebug_level = 0;

/*
 * db1_printf((fmt, ...)) -- level-1 debug printf.
 *
 * The argument is a complete, parenthesized printf() argument list.
 * With DEBUG defined the message is printed when rf_kdebug_level > 0;
 * otherwise the macro expands to nothing.
 *
 * Both forms are wrapped in do { } while (0) so that the macro behaves
 * as exactly one statement: a bare `if' would capture a following
 * `else', and a bare `{ }' followed by `;' cannot precede an `else'.
 */
#ifdef DEBUG
#define db1_printf(a) do { if (rf_kdebug_level > 0) printf a; } while (0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (0)
#endif				/* DEBUG */
165
166 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
167
168 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
169
170 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
171 * spare table */
172 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
173 * installation process */
174
175 /* prototypes */
176 static void KernelWakeupFunc(struct buf * bp);
177 static void InitBP(struct buf * bp, struct vnode *, unsigned rw_flag,
178 dev_t dev, RF_SectorNum_t startSect,
179 RF_SectorCount_t numSect, caddr_t buf,
180 void (*cbFunc) (struct buf *), void *cbArg,
181 int logBytesPerSector, struct proc * b_proc);
182 static void raidinit __P((RF_Raid_t *));
183
184 void raidattach __P((int));
185 int raidsize __P((dev_t));
186 int raidopen __P((dev_t, int, int, struct proc *));
187 int raidclose __P((dev_t, int, int, struct proc *));
188 int raidioctl __P((dev_t, u_long, caddr_t, int, struct proc *));
189 int raidwrite __P((dev_t, struct uio *, int));
190 int raidread __P((dev_t, struct uio *, int));
191 void raidstrategy __P((struct buf *));
192 int raiddump __P((dev_t, daddr_t, caddr_t, size_t));
193
194 /*
195 * Pilfered from ccd.c
196 */
197
198 struct raidbuf {
199 struct buf rf_buf; /* new I/O buf. MUST BE FIRST!!! */
200 struct buf *rf_obp; /* ptr. to original I/O buf */
201 int rf_flags; /* misc. flags */
202 RF_DiskQueueData_t *req;/* the request that this was part of.. */
203 };
204
205
206 #define RAIDGETBUF(rs) pool_get(&(rs)->sc_cbufpool, PR_NOWAIT)
207 #define RAIDPUTBUF(rs, cbp) pool_put(&(rs)->sc_cbufpool, cbp)
208
209 /* XXX Not sure if the following should be replacing the raidPtrs above,
210 or if it should be used in conjunction with that...
211 */
212
213 struct raid_softc {
214 int sc_flags; /* flags */
215 int sc_cflags; /* configuration flags */
216 size_t sc_size; /* size of the raid device */
217 char sc_xname[20]; /* XXX external name */
218 struct disk sc_dkdev; /* generic disk device info */
219 struct pool sc_cbufpool; /* component buffer pool */
220 struct buf_queue buf_queue; /* used for the device queue */
221 };
222 /* sc_flags */
223 #define RAIDF_INITED 0x01 /* unit has been initialized */
224 #define RAIDF_WLABEL 0x02 /* label area is writable */
225 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
226 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
227 #define RAIDF_LOCKED 0x80 /* unit is locked */
228
229 #define raidunit(x) DISKUNIT(x)
230 int numraid = 0;
231
232 /*
233 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
234 * Be aware that large numbers can allow the driver to consume a lot of
235 * kernel memory, especially on writes, and in degraded mode reads.
236 *
237 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
238 * a single 64K write will typically require 64K for the old data,
239 * 64K for the old parity, and 64K for the new parity, for a total
240 * of 192K (if the parity buffer is not re-used immediately).
241 * Even if it is used immediately, that's still 128K, which when multiplied
242 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
243 *
244 * Now in degraded mode, for example, a 64K read on the above setup may
245 * require data reconstruction, which will require *all* of the 4 remaining
246 * disks to participate -- 4 * 32K/disk == 128K again.
247 */
248
/* Default number of simultaneous I/Os per RAID device; may be predefined. */
#ifndef RAIDOUTSTANDING
#define	RAIDOUTSTANDING	6
#endif

/* dev_t of the raw partition on the same major/unit as `dev'. */
#define	RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
255
256 /* declared here, and made public, for the benefit of KVM stuff.. */
257 struct raid_softc *raid_softc;
258
259 static void raidgetdefaultlabel __P((RF_Raid_t *, struct raid_softc *,
260 struct disklabel *));
261 static void raidgetdisklabel __P((dev_t));
262 static void raidmakedisklabel __P((struct raid_softc *));
263
264 static int raidlock __P((struct raid_softc *));
265 static void raidunlock __P((struct raid_softc *));
266
267 static void rf_markalldirty __P((RF_Raid_t *));
268 void rf_mountroot_hook __P((struct device *));
269
270 struct device *raidrootdev;
271
272 void rf_ReconThread __P((struct rf_recon_req *));
273 /* XXX what I want is: */
274 /*void rf_ReconThread __P((RF_Raid_t *raidPtr)); */
275 void rf_RewriteParityThread __P((RF_Raid_t *raidPtr));
276 void rf_CopybackThread __P((RF_Raid_t *raidPtr));
277 void rf_ReconstructInPlaceThread __P((struct rf_recon_req *));
278 void rf_buildroothack __P((void *));
279
280 RF_AutoConfig_t *rf_find_raid_components __P((void));
281 RF_ConfigSet_t *rf_create_auto_sets __P((RF_AutoConfig_t *));
282 static int rf_does_it_fit __P((RF_ConfigSet_t *,RF_AutoConfig_t *));
283 static int rf_reasonable_label __P((RF_ComponentLabel_t *));
284 void rf_create_configuration __P((RF_AutoConfig_t *,RF_Config_t *,
285 RF_Raid_t *));
286 int rf_set_autoconfig __P((RF_Raid_t *, int));
287 int rf_set_rootpartition __P((RF_Raid_t *, int));
288 void rf_release_all_vps __P((RF_ConfigSet_t *));
289 void rf_cleanup_config_set __P((RF_ConfigSet_t *));
290 int rf_have_enough_components __P((RF_ConfigSet_t *));
291 int rf_auto_config_set __P((RF_ConfigSet_t *, int *));
292
293 static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not
294 allow autoconfig to take place.
295 Note that this is overridden by having
296 RAID_AUTOCONFIG as an option in the
297 kernel config file. */
298 extern struct device *booted_device;
299
300 void
301 raidattach(num)
302 int num;
303 {
304 int raidID;
305 int i, rc;
306 RF_AutoConfig_t *ac_list; /* autoconfig list */
307 RF_ConfigSet_t *config_sets;
308
309 #ifdef DEBUG
310 printf("raidattach: Asked for %d units\n", num);
311 #endif
312
313 if (num <= 0) {
314 #ifdef DIAGNOSTIC
315 panic("raidattach: count <= 0");
316 #endif
317 return;
318 }
319 /* This is where all the initialization stuff gets done. */
320
321 numraid = num;
322
323 /* Make some space for requested number of units... */
324
325 RF_Calloc(raidPtrs, num, sizeof(RF_Raid_t *), (RF_Raid_t **));
326 if (raidPtrs == NULL) {
327 panic("raidPtrs is NULL!!\n");
328 }
329
330 rc = rf_mutex_init(&rf_sparet_wait_mutex);
331 if (rc) {
332 RF_PANIC();
333 }
334
335 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
336
337 for (i = 0; i < num; i++)
338 raidPtrs[i] = NULL;
339 rc = rf_BootRaidframe();
340 if (rc == 0)
341 printf("Kernelized RAIDframe activated\n");
342 else
343 panic("Serious error booting RAID!!\n");
344
345 /* put together some datastructures like the CCD device does.. This
346 * lets us lock the device and what-not when it gets opened. */
347
348 raid_softc = (struct raid_softc *)
349 malloc(num * sizeof(struct raid_softc),
350 M_RAIDFRAME, M_NOWAIT);
351 if (raid_softc == NULL) {
352 printf("WARNING: no memory for RAIDframe driver\n");
353 return;
354 }
355
356 bzero(raid_softc, num * sizeof(struct raid_softc));
357
358 raidrootdev = (struct device *)malloc(num * sizeof(struct device),
359 M_RAIDFRAME, M_NOWAIT);
360 if (raidrootdev == NULL) {
361 panic("No memory for RAIDframe driver!!?!?!\n");
362 }
363
364 for (raidID = 0; raidID < num; raidID++) {
365 BUFQ_INIT(&raid_softc[raidID].buf_queue);
366
367 raidrootdev[raidID].dv_class = DV_DISK;
368 raidrootdev[raidID].dv_cfdata = NULL;
369 raidrootdev[raidID].dv_unit = raidID;
370 raidrootdev[raidID].dv_parent = NULL;
371 raidrootdev[raidID].dv_flags = 0;
372 sprintf(raidrootdev[raidID].dv_xname,"raid%d",raidID);
373
374 RF_Calloc(raidPtrs[raidID], 1, sizeof(RF_Raid_t),
375 (RF_Raid_t *));
376 if (raidPtrs[raidID] == NULL) {
377 printf("WARNING: raidPtrs[%d] is NULL\n", raidID);
378 numraid = raidID;
379 return;
380 }
381 }
382
383 #if RAID_AUTOCONFIG
384 raidautoconfig = 1;
385 #endif
386
387 if (raidautoconfig) {
388 /* 1. locate all RAID components on the system */
389
390 #if DEBUG
391 printf("Searching for raid components...\n");
392 #endif
393 ac_list = rf_find_raid_components();
394
395 /* 2. sort them into their respective sets */
396
397 config_sets = rf_create_auto_sets(ac_list);
398
399 /* 3. evaluate each set and configure the valid ones
400 This gets done in rf_buildroothack() */
401
402 /* schedule the creation of the thread to do the
403 "/ on RAID" stuff */
404
405 kthread_create(rf_buildroothack,config_sets);
406
407 #if 0
408 mountroothook_establish(rf_mountroot_hook, &raidrootdev[0]);
409 #endif
410 }
411
412 }
413
414 void
415 rf_buildroothack(arg)
416 void *arg;
417 {
418 RF_ConfigSet_t *config_sets = arg;
419 RF_ConfigSet_t *cset;
420 RF_ConfigSet_t *next_cset;
421 int retcode;
422 int raidID;
423 int rootID;
424 int num_root;
425
426 num_root = 0;
427 cset = config_sets;
428 while(cset != NULL ) {
429 next_cset = cset->next;
430 if (rf_have_enough_components(cset) &&
431 cset->ac->clabel->autoconfigure==1) {
432 retcode = rf_auto_config_set(cset,&raidID);
433 if (!retcode) {
434 if (cset->rootable) {
435 rootID = raidID;
436 num_root++;
437 }
438 } else {
439 /* The autoconfig didn't work :( */
440 #if DEBUG
441 printf("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
442 #endif
443 rf_release_all_vps(cset);
444 }
445 } else {
446 /* we're not autoconfiguring this set...
447 release the associated resources */
448 rf_release_all_vps(cset);
449 }
450 /* cleanup */
451 rf_cleanup_config_set(cset);
452 cset = next_cset;
453 }
454 if (boothowto & RB_ASKNAME) {
455 /* We don't auto-config... */
456 } else {
457 /* They didn't ask, and we found something bootable... */
458
459 if (num_root == 1) {
460 booted_device = &raidrootdev[rootID];
461 } else if (num_root > 1) {
462 /* we can't guess.. require the user to answer... */
463 boothowto |= RB_ASKNAME;
464 }
465 }
466 }
467
468
469 int
470 raidsize(dev)
471 dev_t dev;
472 {
473 struct raid_softc *rs;
474 struct disklabel *lp;
475 int part, unit, omask, size;
476
477 unit = raidunit(dev);
478 if (unit >= numraid)
479 return (-1);
480 rs = &raid_softc[unit];
481
482 if ((rs->sc_flags & RAIDF_INITED) == 0)
483 return (-1);
484
485 part = DISKPART(dev);
486 omask = rs->sc_dkdev.dk_openmask & (1 << part);
487 lp = rs->sc_dkdev.dk_label;
488
489 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curproc))
490 return (-1);
491
492 if (lp->d_partitions[part].p_fstype != FS_SWAP)
493 size = -1;
494 else
495 size = lp->d_partitions[part].p_size *
496 (lp->d_secsize / DEV_BSIZE);
497
498 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curproc))
499 return (-1);
500
501 return (size);
502
503 }
504
505 int
506 raiddump(dev, blkno, va, size)
507 dev_t dev;
508 daddr_t blkno;
509 caddr_t va;
510 size_t size;
511 {
512 /* Not implemented. */
513 return ENXIO;
514 }
515 /* ARGSUSED */
516 int
517 raidopen(dev, flags, fmt, p)
518 dev_t dev;
519 int flags, fmt;
520 struct proc *p;
521 {
522 int unit = raidunit(dev);
523 struct raid_softc *rs;
524 struct disklabel *lp;
525 int part, pmask;
526 int error = 0;
527
528 if (unit >= numraid)
529 return (ENXIO);
530 rs = &raid_softc[unit];
531
532 if ((error = raidlock(rs)) != 0)
533 return (error);
534 lp = rs->sc_dkdev.dk_label;
535
536 part = DISKPART(dev);
537 pmask = (1 << part);
538
539 db1_printf(("Opening raid device number: %d partition: %d\n",
540 unit, part));
541
542
543 if ((rs->sc_flags & RAIDF_INITED) &&
544 (rs->sc_dkdev.dk_openmask == 0))
545 raidgetdisklabel(dev);
546
547 /* make sure that this partition exists */
548
549 if (part != RAW_PART) {
550 db1_printf(("Not a raw partition..\n"));
551 if (((rs->sc_flags & RAIDF_INITED) == 0) ||
552 ((part >= lp->d_npartitions) ||
553 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
554 error = ENXIO;
555 raidunlock(rs);
556 db1_printf(("Bailing out...\n"));
557 return (error);
558 }
559 }
560 /* Prevent this unit from being unconfigured while open. */
561 switch (fmt) {
562 case S_IFCHR:
563 rs->sc_dkdev.dk_copenmask |= pmask;
564 break;
565
566 case S_IFBLK:
567 rs->sc_dkdev.dk_bopenmask |= pmask;
568 break;
569 }
570
571 if ((rs->sc_dkdev.dk_openmask == 0) &&
572 ((rs->sc_flags & RAIDF_INITED) != 0)) {
573 /* First one... mark things as dirty... Note that we *MUST*
574 have done a configure before this. I DO NOT WANT TO BE
575 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
576 THAT THEY BELONG TOGETHER!!!!! */
577 /* XXX should check to see if we're only open for reading
578 here... If so, we needn't do this, but then need some
579 other way of keeping track of what's happened.. */
580
581 rf_markalldirty( raidPtrs[unit] );
582 }
583
584
585 rs->sc_dkdev.dk_openmask =
586 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
587
588 raidunlock(rs);
589
590 return (error);
591
592
593 }
594 /* ARGSUSED */
595 int
596 raidclose(dev, flags, fmt, p)
597 dev_t dev;
598 int flags, fmt;
599 struct proc *p;
600 {
601 int unit = raidunit(dev);
602 struct raid_softc *rs;
603 int error = 0;
604 int part;
605
606 if (unit >= numraid)
607 return (ENXIO);
608 rs = &raid_softc[unit];
609
610 if ((error = raidlock(rs)) != 0)
611 return (error);
612
613 part = DISKPART(dev);
614
615 /* ...that much closer to allowing unconfiguration... */
616 switch (fmt) {
617 case S_IFCHR:
618 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
619 break;
620
621 case S_IFBLK:
622 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
623 break;
624 }
625 rs->sc_dkdev.dk_openmask =
626 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
627
628 if ((rs->sc_dkdev.dk_openmask == 0) &&
629 ((rs->sc_flags & RAIDF_INITED) != 0)) {
630 /* Last one... device is not unconfigured yet.
631 Device shutdown has taken care of setting the
632 clean bits if RAIDF_INITED is not set
633 mark things as clean... */
634 #if 0
635 printf("Last one on raid%d. Updating status.\n",unit);
636 #endif
637 rf_final_update_component_labels( raidPtrs[unit] );
638 }
639
640 raidunlock(rs);
641 return (0);
642
643 }
644
645 void
646 raidstrategy(bp)
647 register struct buf *bp;
648 {
649 register int s;
650
651 unsigned int raidID = raidunit(bp->b_dev);
652 RF_Raid_t *raidPtr;
653 struct raid_softc *rs = &raid_softc[raidID];
654 struct disklabel *lp;
655 int wlabel;
656
657 if ((rs->sc_flags & RAIDF_INITED) ==0) {
658 bp->b_error = ENXIO;
659 bp->b_flags = B_ERROR;
660 bp->b_resid = bp->b_bcount;
661 biodone(bp);
662 return;
663 }
664 if (raidID >= numraid || !raidPtrs[raidID]) {
665 bp->b_error = ENODEV;
666 bp->b_flags |= B_ERROR;
667 bp->b_resid = bp->b_bcount;
668 biodone(bp);
669 return;
670 }
671 raidPtr = raidPtrs[raidID];
672 if (!raidPtr->valid) {
673 bp->b_error = ENODEV;
674 bp->b_flags |= B_ERROR;
675 bp->b_resid = bp->b_bcount;
676 biodone(bp);
677 return;
678 }
679 if (bp->b_bcount == 0) {
680 db1_printf(("b_bcount is zero..\n"));
681 biodone(bp);
682 return;
683 }
684 lp = rs->sc_dkdev.dk_label;
685
686 /*
687 * Do bounds checking and adjust transfer. If there's an
688 * error, the bounds check will flag that for us.
689 */
690
691 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
692 if (DISKPART(bp->b_dev) != RAW_PART)
693 if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
694 db1_printf(("Bounds check failed!!:%d %d\n",
695 (int) bp->b_blkno, (int) wlabel));
696 biodone(bp);
697 return;
698 }
699 s = splbio();
700
701 bp->b_resid = 0;
702
703 /* stuff it onto our queue */
704 BUFQ_INSERT_TAIL(&rs->buf_queue, bp);
705
706 raidstart(raidPtrs[raidID]);
707
708 splx(s);
709 }
710 /* ARGSUSED */
711 int
712 raidread(dev, uio, flags)
713 dev_t dev;
714 struct uio *uio;
715 int flags;
716 {
717 int unit = raidunit(dev);
718 struct raid_softc *rs;
719 int part;
720
721 if (unit >= numraid)
722 return (ENXIO);
723 rs = &raid_softc[unit];
724
725 if ((rs->sc_flags & RAIDF_INITED) == 0)
726 return (ENXIO);
727 part = DISKPART(dev);
728
729 db1_printf(("raidread: unit: %d partition: %d\n", unit, part));
730
731 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
732
733 }
734 /* ARGSUSED */
735 int
736 raidwrite(dev, uio, flags)
737 dev_t dev;
738 struct uio *uio;
739 int flags;
740 {
741 int unit = raidunit(dev);
742 struct raid_softc *rs;
743
744 if (unit >= numraid)
745 return (ENXIO);
746 rs = &raid_softc[unit];
747
748 if ((rs->sc_flags & RAIDF_INITED) == 0)
749 return (ENXIO);
750 db1_printf(("raidwrite\n"));
751 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
752
753 }
754
755 int
756 raidioctl(dev, cmd, data, flag, p)
757 dev_t dev;
758 u_long cmd;
759 caddr_t data;
760 int flag;
761 struct proc *p;
762 {
763 int unit = raidunit(dev);
764 int error = 0;
765 int part, pmask;
766 struct raid_softc *rs;
767 RF_Config_t *k_cfg, *u_cfg;
768 RF_Raid_t *raidPtr;
769 RF_RaidDisk_t *diskPtr;
770 RF_AccTotals_t *totals;
771 RF_DeviceConfig_t *d_cfg, **ucfgp;
772 u_char *specific_buf;
773 int retcode = 0;
774 int row;
775 int column;
776 struct rf_recon_req *rrcopy, *rr;
777 RF_ComponentLabel_t *clabel;
778 RF_ComponentLabel_t ci_label;
779 RF_ComponentLabel_t **clabel_ptr;
780 RF_SingleComponent_t *sparePtr,*componentPtr;
781 RF_SingleComponent_t hot_spare;
782 RF_SingleComponent_t component;
783 int i, j, d;
784
785 if (unit >= numraid)
786 return (ENXIO);
787 rs = &raid_softc[unit];
788 raidPtr = raidPtrs[unit];
789
790 db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
791 (int) DISKPART(dev), (int) unit, (int) cmd));
792
793 /* Must be open for writes for these commands... */
794 switch (cmd) {
795 case DIOCSDINFO:
796 case DIOCWDINFO:
797 case DIOCWLABEL:
798 if ((flag & FWRITE) == 0)
799 return (EBADF);
800 }
801
802 /* Must be initialized for these... */
803 switch (cmd) {
804 case DIOCGDINFO:
805 case DIOCSDINFO:
806 case DIOCWDINFO:
807 case DIOCGPART:
808 case DIOCWLABEL:
809 case DIOCGDEFLABEL:
810 case RAIDFRAME_SHUTDOWN:
811 case RAIDFRAME_REWRITEPARITY:
812 case RAIDFRAME_GET_INFO:
813 case RAIDFRAME_RESET_ACCTOTALS:
814 case RAIDFRAME_GET_ACCTOTALS:
815 case RAIDFRAME_KEEP_ACCTOTALS:
816 case RAIDFRAME_GET_SIZE:
817 case RAIDFRAME_FAIL_DISK:
818 case RAIDFRAME_COPYBACK:
819 case RAIDFRAME_CHECK_RECON_STATUS:
820 case RAIDFRAME_GET_COMPONENT_LABEL:
821 case RAIDFRAME_SET_COMPONENT_LABEL:
822 case RAIDFRAME_ADD_HOT_SPARE:
823 case RAIDFRAME_REMOVE_HOT_SPARE:
824 case RAIDFRAME_INIT_LABELS:
825 case RAIDFRAME_REBUILD_IN_PLACE:
826 case RAIDFRAME_CHECK_PARITY:
827 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
828 case RAIDFRAME_CHECK_COPYBACK_STATUS:
829 case RAIDFRAME_SET_AUTOCONFIG:
830 case RAIDFRAME_SET_ROOT:
831 if ((rs->sc_flags & RAIDF_INITED) == 0)
832 return (ENXIO);
833 }
834
835 switch (cmd) {
836
837 /* configure the system */
838 case RAIDFRAME_CONFIGURE:
839
840 if (raidPtr->valid) {
841 /* There is a valid RAID set running on this unit! */
842 printf("raid%d: Device already configured!\n",unit);
843 return(EINVAL);
844 }
845
846 /* copy-in the configuration information */
847 /* data points to a pointer to the configuration structure */
848
849 u_cfg = *((RF_Config_t **) data);
850 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
851 if (k_cfg == NULL) {
852 return (ENOMEM);
853 }
854 retcode = copyin((caddr_t) u_cfg, (caddr_t) k_cfg,
855 sizeof(RF_Config_t));
856 if (retcode) {
857 RF_Free(k_cfg, sizeof(RF_Config_t));
858 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
859 retcode));
860 return (retcode);
861 }
862 /* allocate a buffer for the layout-specific data, and copy it
863 * in */
864 if (k_cfg->layoutSpecificSize) {
865 if (k_cfg->layoutSpecificSize > 10000) {
866 /* sanity check */
867 RF_Free(k_cfg, sizeof(RF_Config_t));
868 return (EINVAL);
869 }
870 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
871 (u_char *));
872 if (specific_buf == NULL) {
873 RF_Free(k_cfg, sizeof(RF_Config_t));
874 return (ENOMEM);
875 }
876 retcode = copyin(k_cfg->layoutSpecific,
877 (caddr_t) specific_buf,
878 k_cfg->layoutSpecificSize);
879 if (retcode) {
880 RF_Free(k_cfg, sizeof(RF_Config_t));
881 RF_Free(specific_buf,
882 k_cfg->layoutSpecificSize);
883 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
884 retcode));
885 return (retcode);
886 }
887 } else
888 specific_buf = NULL;
889 k_cfg->layoutSpecific = specific_buf;
890
891 /* should do some kind of sanity check on the configuration.
892 * Store the sum of all the bytes in the last byte? */
893
894 /* configure the system */
895
896 /*
897 * Clear the entire RAID descriptor, just to make sure
898 * there is no stale data left in the case of a
899 * reconfiguration
900 */
901 bzero((char *) raidPtr, sizeof(RF_Raid_t));
902 raidPtr->raidid = unit;
903
904 retcode = rf_Configure(raidPtr, k_cfg, NULL);
905
906 if (retcode == 0) {
907
908 /* allow this many simultaneous IO's to
909 this RAID device */
910 raidPtr->openings = RAIDOUTSTANDING;
911
912 raidinit(raidPtr);
913 rf_markalldirty(raidPtr);
914 }
915 /* free the buffers. No return code here. */
916 if (k_cfg->layoutSpecificSize) {
917 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
918 }
919 RF_Free(k_cfg, sizeof(RF_Config_t));
920
921 return (retcode);
922
923 /* shutdown the system */
924 case RAIDFRAME_SHUTDOWN:
925
926 if ((error = raidlock(rs)) != 0)
927 return (error);
928
929 /*
930 * If somebody has a partition mounted, we shouldn't
931 * shutdown.
932 */
933
934 part = DISKPART(dev);
935 pmask = (1 << part);
936 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
937 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
938 (rs->sc_dkdev.dk_copenmask & pmask))) {
939 raidunlock(rs);
940 return (EBUSY);
941 }
942
943 retcode = rf_Shutdown(raidPtr);
944
945 pool_destroy(&rs->sc_cbufpool);
946
947 /* It's no longer initialized... */
948 rs->sc_flags &= ~RAIDF_INITED;
949
950 /* Detach the disk. */
951 disk_detach(&rs->sc_dkdev);
952
953 raidunlock(rs);
954
955 return (retcode);
956 case RAIDFRAME_GET_COMPONENT_LABEL:
957 clabel_ptr = (RF_ComponentLabel_t **) data;
958 /* need to read the component label for the disk indicated
959 by row,column in clabel */
960
961 /* For practice, let's get it directly fromdisk, rather
962 than from the in-core copy */
963 RF_Malloc( clabel, sizeof( RF_ComponentLabel_t ),
964 (RF_ComponentLabel_t *));
965 if (clabel == NULL)
966 return (ENOMEM);
967
968 bzero((char *) clabel, sizeof(RF_ComponentLabel_t));
969
970 retcode = copyin( *clabel_ptr, clabel,
971 sizeof(RF_ComponentLabel_t));
972
973 if (retcode) {
974 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
975 return(retcode);
976 }
977
978 row = clabel->row;
979 column = clabel->column;
980
981 if ((row < 0) || (row >= raidPtr->numRow) ||
982 (column < 0) || (column >= raidPtr->numCol)) {
983 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
984 return(EINVAL);
985 }
986
987 raidread_component_label(raidPtr->Disks[row][column].dev,
988 raidPtr->raid_cinfo[row][column].ci_vp,
989 clabel );
990
991 retcode = copyout((caddr_t) clabel,
992 (caddr_t) *clabel_ptr,
993 sizeof(RF_ComponentLabel_t));
994 RF_Free( clabel, sizeof(RF_ComponentLabel_t));
995 return (retcode);
996
997 case RAIDFRAME_SET_COMPONENT_LABEL:
998 clabel = (RF_ComponentLabel_t *) data;
999
1000 /* XXX check the label for valid stuff... */
1001 /* Note that some things *should not* get modified --
1002 the user should be re-initing the labels instead of
1003 trying to patch things.
1004 */
1005
1006 printf("Got component label:\n");
1007 printf("Version: %d\n",clabel->version);
1008 printf("Serial Number: %d\n",clabel->serial_number);
1009 printf("Mod counter: %d\n",clabel->mod_counter);
1010 printf("Row: %d\n", clabel->row);
1011 printf("Column: %d\n", clabel->column);
1012 printf("Num Rows: %d\n", clabel->num_rows);
1013 printf("Num Columns: %d\n", clabel->num_columns);
1014 printf("Clean: %d\n", clabel->clean);
1015 printf("Status: %d\n", clabel->status);
1016
1017 row = clabel->row;
1018 column = clabel->column;
1019
1020 if ((row < 0) || (row >= raidPtr->numRow) ||
1021 (column < 0) || (column >= raidPtr->numCol)) {
1022 return(EINVAL);
1023 }
1024
1025 /* XXX this isn't allowed to do anything for now :-) */
1026
1027 /* XXX and before it is, we need to fill in the rest
1028 of the fields!?!?!?! */
1029 #if 0
1030 raidwrite_component_label(
1031 raidPtr->Disks[row][column].dev,
1032 raidPtr->raid_cinfo[row][column].ci_vp,
1033 clabel );
1034 #endif
1035 return (0);
1036
1037 case RAIDFRAME_INIT_LABELS:
1038 clabel = (RF_ComponentLabel_t *) data;
1039 /*
1040 we only want the serial number from
1041 the above. We get all the rest of the information
1042 from the config that was used to create this RAID
1043 set.
1044 */
1045
1046 raidPtr->serial_number = clabel->serial_number;
1047
1048 raid_init_component_label(raidPtr, &ci_label);
1049 ci_label.serial_number = clabel->serial_number;
1050
1051 for(row=0;row<raidPtr->numRow;row++) {
1052 ci_label.row = row;
1053 for(column=0;column<raidPtr->numCol;column++) {
1054 diskPtr = &raidPtr->Disks[row][column];
1055 ci_label.partitionSize = diskPtr->partitionSize;
1056 ci_label.column = column;
1057 raidwrite_component_label(
1058 raidPtr->Disks[row][column].dev,
1059 raidPtr->raid_cinfo[row][column].ci_vp,
1060 &ci_label );
1061 }
1062 }
1063
1064 return (retcode);
1065 case RAIDFRAME_SET_AUTOCONFIG:
1066 d = rf_set_autoconfig(raidPtr, *data);
1067 printf("New autoconfig value is: %d\n", d);
1068 *data = d;
1069 return (retcode);
1070
1071 case RAIDFRAME_SET_ROOT:
1072 d = rf_set_rootpartition(raidPtr, *data);
1073 printf("New rootpartition value is: %d\n", d);
1074 *data = d;
1075 return (retcode);
1076
1077 /* initialize all parity */
1078 case RAIDFRAME_REWRITEPARITY:
1079
1080 if (raidPtr->Layout.map->faultsTolerated == 0) {
1081 /* Parity for RAID 0 is trivially correct */
1082 raidPtr->parity_good = RF_RAID_CLEAN;
1083 return(0);
1084 }
1085
1086 if (raidPtr->parity_rewrite_in_progress == 1) {
1087 /* Re-write is already in progress! */
1088 return(EINVAL);
1089 }
1090
1091 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1092 rf_RewriteParityThread,
1093 raidPtr,"raid_parity");
1094 return (retcode);
1095
1096
1097 case RAIDFRAME_ADD_HOT_SPARE:
1098 sparePtr = (RF_SingleComponent_t *) data;
1099 memcpy( &hot_spare, sparePtr, sizeof(RF_SingleComponent_t));
1100 printf("Adding spare\n");
1101 retcode = rf_add_hot_spare(raidPtr, &hot_spare);
1102 return(retcode);
1103
1104 case RAIDFRAME_REMOVE_HOT_SPARE:
1105 return(retcode);
1106
1107 case RAIDFRAME_REBUILD_IN_PLACE:
1108
1109 if (raidPtr->Layout.map->faultsTolerated == 0) {
1110 /* Can't do this on a RAID 0!! */
1111 return(EINVAL);
1112 }
1113
1114 if (raidPtr->recon_in_progress == 1) {
1115 /* a reconstruct is already in progress! */
1116 return(EINVAL);
1117 }
1118
1119 componentPtr = (RF_SingleComponent_t *) data;
1120 memcpy( &component, componentPtr,
1121 sizeof(RF_SingleComponent_t));
1122 row = component.row;
1123 column = component.column;
1124 printf("Rebuild: %d %d\n",row, column);
1125 if ((row < 0) || (row >= raidPtr->numRow) ||
1126 (column < 0) || (column >= raidPtr->numCol)) {
1127 return(EINVAL);
1128 }
1129
1130 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1131 if (rrcopy == NULL)
1132 return(ENOMEM);
1133
1134 rrcopy->raidPtr = (void *) raidPtr;
1135 rrcopy->row = row;
1136 rrcopy->col = column;
1137
1138 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1139 rf_ReconstructInPlaceThread,
1140 rrcopy,"raid_reconip");
1141 return(retcode);
1142
1143 case RAIDFRAME_GET_INFO:
1144 if (!raidPtr->valid)
1145 return (ENODEV);
1146 ucfgp = (RF_DeviceConfig_t **) data;
1147 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1148 (RF_DeviceConfig_t *));
1149 if (d_cfg == NULL)
1150 return (ENOMEM);
1151 bzero((char *) d_cfg, sizeof(RF_DeviceConfig_t));
1152 d_cfg->rows = raidPtr->numRow;
1153 d_cfg->cols = raidPtr->numCol;
1154 d_cfg->ndevs = raidPtr->numRow * raidPtr->numCol;
1155 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1156 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1157 return (ENOMEM);
1158 }
1159 d_cfg->nspares = raidPtr->numSpare;
1160 if (d_cfg->nspares >= RF_MAX_DISKS) {
1161 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1162 return (ENOMEM);
1163 }
1164 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1165 d = 0;
1166 for (i = 0; i < d_cfg->rows; i++) {
1167 for (j = 0; j < d_cfg->cols; j++) {
1168 d_cfg->devs[d] = raidPtr->Disks[i][j];
1169 d++;
1170 }
1171 }
1172 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1173 d_cfg->spares[i] = raidPtr->Disks[0][j];
1174 }
1175 retcode = copyout((caddr_t) d_cfg, (caddr_t) * ucfgp,
1176 sizeof(RF_DeviceConfig_t));
1177 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1178
1179 return (retcode);
1180
1181 case RAIDFRAME_CHECK_PARITY:
1182 *(int *) data = raidPtr->parity_good;
1183 return (0);
1184
1185 case RAIDFRAME_RESET_ACCTOTALS:
1186 bzero(&raidPtr->acc_totals, sizeof(raidPtr->acc_totals));
1187 return (0);
1188
1189 case RAIDFRAME_GET_ACCTOTALS:
1190 totals = (RF_AccTotals_t *) data;
1191 *totals = raidPtr->acc_totals;
1192 return (0);
1193
1194 case RAIDFRAME_KEEP_ACCTOTALS:
1195 raidPtr->keep_acc_totals = *(int *)data;
1196 return (0);
1197
1198 case RAIDFRAME_GET_SIZE:
1199 *(int *) data = raidPtr->totalSectors;
1200 return (0);
1201
1202 /* fail a disk & optionally start reconstruction */
1203 case RAIDFRAME_FAIL_DISK:
1204
1205 if (raidPtr->Layout.map->faultsTolerated == 0) {
1206 /* Can't do this on a RAID 0!! */
1207 return(EINVAL);
1208 }
1209
1210 rr = (struct rf_recon_req *) data;
1211
1212 if (rr->row < 0 || rr->row >= raidPtr->numRow
1213 || rr->col < 0 || rr->col >= raidPtr->numCol)
1214 return (EINVAL);
1215
1216 printf("raid%d: Failing the disk: row: %d col: %d\n",
1217 unit, rr->row, rr->col);
1218
1219 /* make a copy of the recon request so that we don't rely on
1220 * the user's buffer */
1221 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1222 if (rrcopy == NULL)
1223 return(ENOMEM);
1224 bcopy(rr, rrcopy, sizeof(*rr));
1225 rrcopy->raidPtr = (void *) raidPtr;
1226
1227 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1228 rf_ReconThread,
1229 rrcopy,"raid_recon");
1230 return (0);
1231
1232 /* invoke a copyback operation after recon on whatever disk
1233 * needs it, if any */
1234 case RAIDFRAME_COPYBACK:
1235
1236 if (raidPtr->Layout.map->faultsTolerated == 0) {
1237 /* This makes no sense on a RAID 0!! */
1238 return(EINVAL);
1239 }
1240
1241 if (raidPtr->copyback_in_progress == 1) {
1242 /* Copyback is already in progress! */
1243 return(EINVAL);
1244 }
1245
1246 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1247 rf_CopybackThread,
1248 raidPtr,"raid_copyback");
1249 return (retcode);
1250
1251 /* return the percentage completion of reconstruction */
1252 case RAIDFRAME_CHECK_RECON_STATUS:
1253 if (raidPtr->Layout.map->faultsTolerated == 0) {
1254 /* This makes no sense on a RAID 0, so tell the
1255 user it's done. */
1256 *(int *) data = 100;
1257 return(0);
1258 }
1259 row = 0; /* XXX we only consider a single row... */
1260 if (raidPtr->status[row] != rf_rs_reconstructing)
1261 *(int *) data = 100;
1262 else
1263 *(int *) data = raidPtr->reconControl[row]->percentComplete;
1264 return (0);
1265
1266 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1267 if (raidPtr->Layout.map->faultsTolerated == 0) {
1268 /* This makes no sense on a RAID 0 */
1269 return(EINVAL);
1270 }
1271 if (raidPtr->parity_rewrite_in_progress == 1) {
1272 *(int *) data = 100 * raidPtr->parity_rewrite_stripes_done / raidPtr->Layout.numStripe;
1273 } else {
1274 *(int *) data = 100;
1275 }
1276 return (0);
1277
1278 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1279 if (raidPtr->Layout.map->faultsTolerated == 0) {
1280 /* This makes no sense on a RAID 0 */
1281 return(EINVAL);
1282 }
1283 if (raidPtr->copyback_in_progress == 1) {
1284 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1285 raidPtr->Layout.numStripe;
1286 } else {
1287 *(int *) data = 100;
1288 }
1289 return (0);
1290
1291
1292 /* the sparetable daemon calls this to wait for the kernel to
1293 * need a spare table. this ioctl does not return until a
1294 * spare table is needed. XXX -- calling mpsleep here in the
1295 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1296 * -- I should either compute the spare table in the kernel,
1297 * or have a different -- XXX XXX -- interface (a different
1298 * character device) for delivering the table -- XXX */
1299 #if 0
1300 case RAIDFRAME_SPARET_WAIT:
1301 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1302 while (!rf_sparet_wait_queue)
1303 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1304 waitreq = rf_sparet_wait_queue;
1305 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1306 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1307
1308 /* structure assignment */
1309 *((RF_SparetWait_t *) data) = *waitreq;
1310
1311 RF_Free(waitreq, sizeof(*waitreq));
1312 return (0);
1313
1314 /* wakes up a process waiting on SPARET_WAIT and puts an error
1315 * code in it that will cause the daemon to exit */
1316 case RAIDFRAME_ABORT_SPARET_WAIT:
1317 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1318 waitreq->fcol = -1;
1319 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1320 waitreq->next = rf_sparet_wait_queue;
1321 rf_sparet_wait_queue = waitreq;
1322 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1323 wakeup(&rf_sparet_wait_queue);
1324 return (0);
1325
1326 /* used by the spare table daemon to deliver a spare table
1327 * into the kernel */
1328 case RAIDFRAME_SEND_SPARET:
1329
1330 /* install the spare table */
1331 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1332
1333 /* respond to the requestor. the return status of the spare
1334 * table installation is passed in the "fcol" field */
1335 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1336 waitreq->fcol = retcode;
1337 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1338 waitreq->next = rf_sparet_resp_queue;
1339 rf_sparet_resp_queue = waitreq;
1340 wakeup(&rf_sparet_resp_queue);
1341 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1342
1343 return (retcode);
1344 #endif
1345
1346 default:
1347 break; /* fall through to the os-specific code below */
1348
1349 }
1350
1351 if (!raidPtr->valid)
1352 return (EINVAL);
1353
1354 /*
1355 * Add support for "regular" device ioctls here.
1356 */
1357
1358 switch (cmd) {
1359 case DIOCGDINFO:
1360 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1361 break;
1362
1363 case DIOCGPART:
1364 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1365 ((struct partinfo *) data)->part =
1366 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1367 break;
1368
1369 case DIOCWDINFO:
1370 case DIOCSDINFO:
1371 if ((error = raidlock(rs)) != 0)
1372 return (error);
1373
1374 rs->sc_flags |= RAIDF_LABELLING;
1375
1376 error = setdisklabel(rs->sc_dkdev.dk_label,
1377 (struct disklabel *) data, 0, rs->sc_dkdev.dk_cpulabel);
1378 if (error == 0) {
1379 if (cmd == DIOCWDINFO)
1380 error = writedisklabel(RAIDLABELDEV(dev),
1381 raidstrategy, rs->sc_dkdev.dk_label,
1382 rs->sc_dkdev.dk_cpulabel);
1383 }
1384 rs->sc_flags &= ~RAIDF_LABELLING;
1385
1386 raidunlock(rs);
1387
1388 if (error)
1389 return (error);
1390 break;
1391
1392 case DIOCWLABEL:
1393 if (*(int *) data != 0)
1394 rs->sc_flags |= RAIDF_WLABEL;
1395 else
1396 rs->sc_flags &= ~RAIDF_WLABEL;
1397 break;
1398
1399 case DIOCGDEFLABEL:
1400 raidgetdefaultlabel(raidPtr, rs,
1401 (struct disklabel *) data);
1402 break;
1403
1404 default:
1405 retcode = ENOTTY;
1406 }
1407 return (retcode);
1408
1409 }
1410
1411
1412 /* raidinit -- complete the rest of the initialization for the
1413 RAIDframe device. */
1414
1415
1416 static void
1417 raidinit(raidPtr)
1418 RF_Raid_t *raidPtr;
1419 {
1420 struct raid_softc *rs;
1421 int unit;
1422
1423 unit = raidPtr->raidid;
1424
1425 rs = &raid_softc[unit];
1426 pool_init(&rs->sc_cbufpool, sizeof(struct raidbuf), 0,
1427 0, 0, "raidpl", 0, NULL, NULL, M_RAIDFRAME);
1428
1429
1430 /* XXX should check return code first... */
1431 rs->sc_flags |= RAIDF_INITED;
1432
1433 sprintf(rs->sc_xname, "raid%d", unit); /* XXX doesn't check bounds. */
1434
1435 rs->sc_dkdev.dk_name = rs->sc_xname;
1436
1437 /* disk_attach actually creates space for the CPU disklabel, among
1438 * other things, so it's critical to call this *BEFORE* we try putzing
1439 * with disklabels. */
1440
1441 disk_attach(&rs->sc_dkdev);
1442
1443 /* XXX There may be a weird interaction here between this, and
1444 * protectedSectors, as used in RAIDframe. */
1445
1446 rs->sc_size = raidPtr->totalSectors;
1447
1448 }
1449
1450 /* wake up the daemon & tell it to get us a spare table
1451 * XXX
1452 * the entries in the queues should be tagged with the raidPtr
1453 * so that in the extremely rare case that two recons happen at once,
1454 * we know for which device were requesting a spare table
1455 * XXX
1456 *
1457 * XXX This code is not currently used. GO
1458 */
1459 int
1460 rf_GetSpareTableFromDaemon(req)
1461 RF_SparetWait_t *req;
1462 {
1463 int retcode;
1464
1465 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1466 req->next = rf_sparet_wait_queue;
1467 rf_sparet_wait_queue = req;
1468 wakeup(&rf_sparet_wait_queue);
1469
1470 /* mpsleep unlocks the mutex */
1471 while (!rf_sparet_resp_queue) {
1472 tsleep(&rf_sparet_resp_queue, PRIBIO,
1473 "raidframe getsparetable", 0);
1474 }
1475 req = rf_sparet_resp_queue;
1476 rf_sparet_resp_queue = req->next;
1477 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1478
1479 retcode = req->fcol;
1480 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1481 * alloc'd */
1482 return (retcode);
1483 }
1484
1485 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1486 * bp & passes it down.
1487 * any calls originating in the kernel must use non-blocking I/O
1488 * do some extra sanity checking to return "appropriate" error values for
1489 * certain conditions (to make some standard utilities work)
1490 *
1491 * Formerly known as: rf_DoAccessKernel
1492 */
1493 void
1494 raidstart(raidPtr)
1495 RF_Raid_t *raidPtr;
1496 {
1497 RF_SectorCount_t num_blocks, pb, sum;
1498 RF_RaidAddr_t raid_addr;
1499 int retcode;
1500 struct partition *pp;
1501 daddr_t blocknum;
1502 int unit;
1503 struct raid_softc *rs;
1504 int do_async;
1505 struct buf *bp;
1506
1507 unit = raidPtr->raidid;
1508 rs = &raid_softc[unit];
1509
1510 /* quick check to see if anything has died recently */
1511 RF_LOCK_MUTEX(raidPtr->mutex);
1512 if (raidPtr->numNewFailures > 0) {
1513 rf_update_component_labels(raidPtr);
1514 raidPtr->numNewFailures--;
1515 }
1516 RF_UNLOCK_MUTEX(raidPtr->mutex);
1517
1518 /* Check to see if we're at the limit... */
1519 RF_LOCK_MUTEX(raidPtr->mutex);
1520 while (raidPtr->openings > 0) {
1521 RF_UNLOCK_MUTEX(raidPtr->mutex);
1522
1523 /* get the next item, if any, from the queue */
1524 if ((bp = BUFQ_FIRST(&rs->buf_queue)) == NULL) {
1525 /* nothing more to do */
1526 return;
1527 }
1528 BUFQ_REMOVE(&rs->buf_queue, bp);
1529
1530 /* Ok, for the bp we have here, bp->b_blkno is relative to the
1531 * partition.. Need to make it absolute to the underlying
1532 * device.. */
1533
1534 blocknum = bp->b_blkno;
1535 if (DISKPART(bp->b_dev) != RAW_PART) {
1536 pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
1537 blocknum += pp->p_offset;
1538 }
1539
1540 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1541 (int) blocknum));
1542
1543 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1544 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1545
1546 /* *THIS* is where we adjust what block we're going to...
1547 * but DO NOT TOUCH bp->b_blkno!!! */
1548 raid_addr = blocknum;
1549
1550 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1551 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1552 sum = raid_addr + num_blocks + pb;
1553 if (1 || rf_debugKernelAccess) {
1554 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1555 (int) raid_addr, (int) sum, (int) num_blocks,
1556 (int) pb, (int) bp->b_resid));
1557 }
1558 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1559 || (sum < num_blocks) || (sum < pb)) {
1560 bp->b_error = ENOSPC;
1561 bp->b_flags |= B_ERROR;
1562 bp->b_resid = bp->b_bcount;
1563 biodone(bp);
1564 RF_LOCK_MUTEX(raidPtr->mutex);
1565 continue;
1566 }
1567 /*
1568 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1569 */
1570
1571 if (bp->b_bcount & raidPtr->sectorMask) {
1572 bp->b_error = EINVAL;
1573 bp->b_flags |= B_ERROR;
1574 bp->b_resid = bp->b_bcount;
1575 biodone(bp);
1576 RF_LOCK_MUTEX(raidPtr->mutex);
1577 continue;
1578
1579 }
1580 db1_printf(("Calling DoAccess..\n"));
1581
1582
1583 RF_LOCK_MUTEX(raidPtr->mutex);
1584 raidPtr->openings--;
1585 RF_UNLOCK_MUTEX(raidPtr->mutex);
1586
1587 /*
1588 * Everything is async.
1589 */
1590 do_async = 1;
1591
1592 /* don't ever condition on bp->b_flags & B_WRITE.
1593 * always condition on B_READ instead */
1594
1595 /* XXX we're still at splbio() here... do we *really*
1596 need to be? */
1597
1598
1599 retcode = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
1600 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
1601 do_async, raid_addr, num_blocks,
1602 bp->b_un.b_addr, bp, NULL, NULL,
1603 RF_DAG_NONBLOCKING_IO, NULL, NULL, NULL);
1604
1605
1606 RF_LOCK_MUTEX(raidPtr->mutex);
1607 }
1608 RF_UNLOCK_MUTEX(raidPtr->mutex);
1609 }
1610
1611
1612
1613
1614 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
1615
1616 int
1617 rf_DispatchKernelIO(queue, req)
1618 RF_DiskQueue_t *queue;
1619 RF_DiskQueueData_t *req;
1620 {
1621 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
1622 struct buf *bp;
1623 struct raidbuf *raidbp = NULL;
1624 struct raid_softc *rs;
1625 int unit;
1626 int s;
1627
1628 s=0;
1629 /* s = splbio();*/ /* want to test this */
1630 /* XXX along with the vnode, we also need the softc associated with
1631 * this device.. */
1632
1633 req->queue = queue;
1634
1635 unit = queue->raidPtr->raidid;
1636
1637 db1_printf(("DispatchKernelIO unit: %d\n", unit));
1638
1639 if (unit >= numraid) {
1640 printf("Invalid unit number: %d %d\n", unit, numraid);
1641 panic("Invalid Unit number in rf_DispatchKernelIO\n");
1642 }
1643 rs = &raid_softc[unit];
1644
1645 /* XXX is this the right place? */
1646 disk_busy(&rs->sc_dkdev);
1647
1648 bp = req->bp;
1649 #if 1
1650 /* XXX when there is a physical disk failure, someone is passing us a
1651 * buffer that contains old stuff!! Attempt to deal with this problem
1652 * without taking a performance hit... (not sure where the real bug
1653 * is. It's buried in RAIDframe somewhere) :-( GO ) */
1654
1655 if (bp->b_flags & B_ERROR) {
1656 bp->b_flags &= ~B_ERROR;
1657 }
1658 if (bp->b_error != 0) {
1659 bp->b_error = 0;
1660 }
1661 #endif
1662 raidbp = RAIDGETBUF(rs);
1663
1664 raidbp->rf_flags = 0; /* XXX not really used anywhere... */
1665
1666 /*
1667 * context for raidiodone
1668 */
1669 raidbp->rf_obp = bp;
1670 raidbp->req = req;
1671
1672 LIST_INIT(&raidbp->rf_buf.b_dep);
1673
1674 switch (req->type) {
1675 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
1676 /* XXX need to do something extra here.. */
1677 /* I'm leaving this in, as I've never actually seen it used,
1678 * and I'd like folks to report it... GO */
1679 printf(("WAKEUP CALLED\n"));
1680 queue->numOutstanding++;
1681
1682 /* XXX need to glue the original buffer into this?? */
1683
1684 KernelWakeupFunc(&raidbp->rf_buf);
1685 break;
1686
1687 case RF_IO_TYPE_READ:
1688 case RF_IO_TYPE_WRITE:
1689
1690 if (req->tracerec) {
1691 RF_ETIMER_START(req->tracerec->timer);
1692 }
1693 InitBP(&raidbp->rf_buf, queue->rf_cinfo->ci_vp,
1694 op | bp->b_flags, queue->rf_cinfo->ci_dev,
1695 req->sectorOffset, req->numSector,
1696 req->buf, KernelWakeupFunc, (void *) req,
1697 queue->raidPtr->logBytesPerSector, req->b_proc);
1698
1699 if (rf_debugKernelAccess) {
1700 db1_printf(("dispatch: bp->b_blkno = %ld\n",
1701 (long) bp->b_blkno));
1702 }
1703 queue->numOutstanding++;
1704 queue->last_deq_sector = req->sectorOffset;
1705 /* acc wouldn't have been let in if there were any pending
1706 * reqs at any other priority */
1707 queue->curPriority = req->priority;
1708
1709 db1_printf(("Going for %c to unit %d row %d col %d\n",
1710 req->type, unit, queue->row, queue->col));
1711 db1_printf(("sector %d count %d (%d bytes) %d\n",
1712 (int) req->sectorOffset, (int) req->numSector,
1713 (int) (req->numSector <<
1714 queue->raidPtr->logBytesPerSector),
1715 (int) queue->raidPtr->logBytesPerSector));
1716 if ((raidbp->rf_buf.b_flags & B_READ) == 0) {
1717 raidbp->rf_buf.b_vp->v_numoutput++;
1718 }
1719 VOP_STRATEGY(&raidbp->rf_buf);
1720
1721 break;
1722
1723 default:
1724 panic("bad req->type in rf_DispatchKernelIO");
1725 }
1726 db1_printf(("Exiting from DispatchKernelIO\n"));
1727 /* splx(s); */ /* want to test this */
1728 return (0);
1729 }
1730 /* this is the callback function associated with a I/O invoked from
1731 kernel code.
1732 */
1733 static void
1734 KernelWakeupFunc(vbp)
1735 struct buf *vbp;
1736 {
1737 RF_DiskQueueData_t *req = NULL;
1738 RF_DiskQueue_t *queue;
1739 struct raidbuf *raidbp = (struct raidbuf *) vbp;
1740 struct buf *bp;
1741 struct raid_softc *rs;
1742 int unit;
1743 register int s;
1744
1745 s = splbio();
1746 db1_printf(("recovering the request queue:\n"));
1747 req = raidbp->req;
1748
1749 bp = raidbp->rf_obp;
1750
1751 queue = (RF_DiskQueue_t *) req->queue;
1752
1753 if (raidbp->rf_buf.b_flags & B_ERROR) {
1754 bp->b_flags |= B_ERROR;
1755 bp->b_error = raidbp->rf_buf.b_error ?
1756 raidbp->rf_buf.b_error : EIO;
1757 }
1758
1759 /* XXX methinks this could be wrong... */
1760 #if 1
1761 bp->b_resid = raidbp->rf_buf.b_resid;
1762 #endif
1763
1764 if (req->tracerec) {
1765 RF_ETIMER_STOP(req->tracerec->timer);
1766 RF_ETIMER_EVAL(req->tracerec->timer);
1767 RF_LOCK_MUTEX(rf_tracing_mutex);
1768 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
1769 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
1770 req->tracerec->num_phys_ios++;
1771 RF_UNLOCK_MUTEX(rf_tracing_mutex);
1772 }
1773 bp->b_bcount = raidbp->rf_buf.b_bcount; /* XXXX ?? */
1774
1775 unit = queue->raidPtr->raidid; /* *Much* simpler :-> */
1776
1777
1778 /* XXX Ok, let's get aggressive... If B_ERROR is set, let's go
1779 * ballistic, and mark the component as hosed... */
1780
1781 if (bp->b_flags & B_ERROR) {
1782 /* Mark the disk as dead */
1783 /* but only mark it once... */
1784 if (queue->raidPtr->Disks[queue->row][queue->col].status ==
1785 rf_ds_optimal) {
1786 printf("raid%d: IO Error. Marking %s as failed.\n",
1787 unit, queue->raidPtr->Disks[queue->row][queue->col].devname);
1788 queue->raidPtr->Disks[queue->row][queue->col].status =
1789 rf_ds_failed;
1790 queue->raidPtr->status[queue->row] = rf_rs_degraded;
1791 queue->raidPtr->numFailures++;
1792 queue->raidPtr->numNewFailures++;
1793 /* XXX here we should bump the version number for each component, and write that data out */
1794 } else { /* Disk is already dead... */
1795 /* printf("Disk already marked as dead!\n"); */
1796 }
1797
1798 }
1799
1800 rs = &raid_softc[unit];
1801 RAIDPUTBUF(rs, raidbp);
1802
1803
1804 if (bp->b_resid == 0) {
1805 /* XXX is this the right place for a disk_unbusy()??!??!?!? */
1806 disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid));
1807 }
1808
1809 rf_DiskIOComplete(queue, req, (bp->b_flags & B_ERROR) ? 1 : 0);
1810 (req->CompleteFunc) (req->argument, (bp->b_flags & B_ERROR) ? 1 : 0);
1811
1812 splx(s);
1813 }
1814
1815
1816
1817 /*
1818 * initialize a buf structure for doing an I/O in the kernel.
1819 */
1820 static void
1821 InitBP(bp, b_vp, rw_flag, dev, startSect, numSect, buf, cbFunc, cbArg,
1822 logBytesPerSector, b_proc)
1823 struct buf *bp;
1824 struct vnode *b_vp;
1825 unsigned rw_flag;
1826 dev_t dev;
1827 RF_SectorNum_t startSect;
1828 RF_SectorCount_t numSect;
1829 caddr_t buf;
1830 void (*cbFunc) (struct buf *);
1831 void *cbArg;
1832 int logBytesPerSector;
1833 struct proc *b_proc;
1834 {
1835 /* bp->b_flags = B_PHYS | rw_flag; */
1836 bp->b_flags = B_CALL | rw_flag; /* XXX need B_PHYS here too??? */
1837 bp->b_bcount = numSect << logBytesPerSector;
1838 bp->b_bufsize = bp->b_bcount;
1839 bp->b_error = 0;
1840 bp->b_dev = dev;
1841 bp->b_un.b_addr = buf;
1842 bp->b_blkno = startSect;
1843 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
1844 if (bp->b_bcount == 0) {
1845 panic("bp->b_bcount is zero in InitBP!!\n");
1846 }
1847 bp->b_proc = b_proc;
1848 bp->b_iodone = cbFunc;
1849 bp->b_vp = b_vp;
1850
1851 }
1852
1853 static void
1854 raidgetdefaultlabel(raidPtr, rs, lp)
1855 RF_Raid_t *raidPtr;
1856 struct raid_softc *rs;
1857 struct disklabel *lp;
1858 {
1859 db1_printf(("Building a default label...\n"));
1860 bzero(lp, sizeof(*lp));
1861
1862 /* fabricate a label... */
1863 lp->d_secperunit = raidPtr->totalSectors;
1864 lp->d_secsize = raidPtr->bytesPerSector;
1865 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
1866 lp->d_ntracks = 1;
1867 lp->d_ncylinders = raidPtr->totalSectors /
1868 (lp->d_nsectors * lp->d_ntracks);
1869 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1870
1871 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
1872 lp->d_type = DTYPE_RAID;
1873 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1874 lp->d_rpm = 3600;
1875 lp->d_interleave = 1;
1876 lp->d_flags = 0;
1877
1878 lp->d_partitions[RAW_PART].p_offset = 0;
1879 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
1880 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1881 lp->d_npartitions = RAW_PART + 1;
1882
1883 lp->d_magic = DISKMAGIC;
1884 lp->d_magic2 = DISKMAGIC;
1885 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
1886
1887 }
1888 /*
1889 * Read the disklabel from the raid device. If one is not present, fake one
1890 * up.
1891 */
1892 static void
1893 raidgetdisklabel(dev)
1894 dev_t dev;
1895 {
1896 int unit = raidunit(dev);
1897 struct raid_softc *rs = &raid_softc[unit];
1898 char *errstring;
1899 struct disklabel *lp = rs->sc_dkdev.dk_label;
1900 struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
1901 RF_Raid_t *raidPtr;
1902
1903 db1_printf(("Getting the disklabel...\n"));
1904
1905 bzero(clp, sizeof(*clp));
1906
1907 raidPtr = raidPtrs[unit];
1908
1909 raidgetdefaultlabel(raidPtr, rs, lp);
1910
1911 /*
1912 * Call the generic disklabel extraction routine.
1913 */
1914 errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
1915 rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
1916 if (errstring)
1917 raidmakedisklabel(rs);
1918 else {
1919 int i;
1920 struct partition *pp;
1921
1922 /*
1923 * Sanity check whether the found disklabel is valid.
1924 *
1925 * This is necessary since total size of the raid device
1926 * may vary when an interleave is changed even though exactly
1927 * same componets are used, and old disklabel may used
1928 * if that is found.
1929 */
1930 if (lp->d_secperunit != rs->sc_size)
1931 printf("WARNING: %s: "
1932 "total sector size in disklabel (%d) != "
1933 "the size of raid (%ld)\n", rs->sc_xname,
1934 lp->d_secperunit, (long) rs->sc_size);
1935 for (i = 0; i < lp->d_npartitions; i++) {
1936 pp = &lp->d_partitions[i];
1937 if (pp->p_offset + pp->p_size > rs->sc_size)
1938 printf("WARNING: %s: end of partition `%c' "
1939 "exceeds the size of raid (%ld)\n",
1940 rs->sc_xname, 'a' + i, (long) rs->sc_size);
1941 }
1942 }
1943
1944 }
1945 /*
1946 * Take care of things one might want to take care of in the event
1947 * that a disklabel isn't present.
1948 */
1949 static void
1950 raidmakedisklabel(rs)
1951 struct raid_softc *rs;
1952 {
1953 struct disklabel *lp = rs->sc_dkdev.dk_label;
1954 db1_printf(("Making a label..\n"));
1955
1956 /*
1957 * For historical reasons, if there's no disklabel present
1958 * the raw partition must be marked FS_BSDFFS.
1959 */
1960
1961 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1962
1963 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1964
1965 lp->d_checksum = dkcksum(lp);
1966 }
1967 /*
1968 * Lookup the provided name in the filesystem. If the file exists,
1969 * is a valid block device, and isn't being used by anyone else,
1970 * set *vpp to the file's vnode.
1971 * You'll find the original of this in ccd.c
1972 */
1973 int
1974 raidlookup(path, p, vpp)
1975 char *path;
1976 struct proc *p;
1977 struct vnode **vpp; /* result */
1978 {
1979 struct nameidata nd;
1980 struct vnode *vp;
1981 struct vattr va;
1982 int error;
1983
1984 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, p);
1985 if ((error = vn_open(&nd, FREAD | FWRITE, 0)) != 0) {
1986 #ifdef DEBUG
1987 printf("RAIDframe: vn_open returned %d\n", error);
1988 #endif
1989 return (error);
1990 }
1991 vp = nd.ni_vp;
1992 if (vp->v_usecount > 1) {
1993 VOP_UNLOCK(vp, 0);
1994 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
1995 return (EBUSY);
1996 }
1997 if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) != 0) {
1998 VOP_UNLOCK(vp, 0);
1999 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
2000 return (error);
2001 }
2002 /* XXX: eventually we should handle VREG, too. */
2003 if (va.va_type != VBLK) {
2004 VOP_UNLOCK(vp, 0);
2005 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
2006 return (ENOTBLK);
2007 }
2008 VOP_UNLOCK(vp, 0);
2009 *vpp = vp;
2010 return (0);
2011 }
2012 /*
2013 * Wait interruptibly for an exclusive lock.
2014 *
2015 * XXX
2016 * Several drivers do this; it should be abstracted and made MP-safe.
2017 * (Hmm... where have we seen this warning before :-> GO )
2018 */
2019 static int
2020 raidlock(rs)
2021 struct raid_softc *rs;
2022 {
2023 int error;
2024
2025 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2026 rs->sc_flags |= RAIDF_WANTED;
2027 if ((error =
2028 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2029 return (error);
2030 }
2031 rs->sc_flags |= RAIDF_LOCKED;
2032 return (0);
2033 }
2034 /*
2035 * Unlock and wake up any waiters.
2036 */
2037 static void
2038 raidunlock(rs)
2039 struct raid_softc *rs;
2040 {
2041
2042 rs->sc_flags &= ~RAIDF_LOCKED;
2043 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2044 rs->sc_flags &= ~RAIDF_WANTED;
2045 wakeup(rs);
2046 }
2047 }
2048
2049
/*
 * Byte offset and byte length of the RAID component label area.  The
 * component label read/write routines divide the offset by DEV_BSIZE to
 * get the block number and use the size as the transfer length.
 */
2050 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2051 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2052
2053 int
2054 raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter)
2055 {
2056 RF_ComponentLabel_t clabel;
2057 raidread_component_label(dev, b_vp, &clabel);
2058 clabel.mod_counter = mod_counter;
2059 clabel.clean = RF_RAID_CLEAN;
2060 raidwrite_component_label(dev, b_vp, &clabel);
2061 return(0);
2062 }
2063
2064
2065 int
2066 raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter)
2067 {
2068 RF_ComponentLabel_t clabel;
2069 raidread_component_label(dev, b_vp, &clabel);
2070 clabel.mod_counter = mod_counter;
2071 clabel.clean = RF_RAID_DIRTY;
2072 raidwrite_component_label(dev, b_vp, &clabel);
2073 return(0);
2074 }
2075
2076 /* ARGSUSED */
2077 int
2078 raidread_component_label(dev, b_vp, clabel)
2079 dev_t dev;
2080 struct vnode *b_vp;
2081 RF_ComponentLabel_t *clabel;
2082 {
2083 struct buf *bp;
2084 int error;
2085
2086 /* XXX should probably ensure that we don't try to do this if
2087 someone has changed rf_protected_sectors. */
2088
2089 /* get a block of the appropriate size... */
2090 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2091 bp->b_dev = dev;
2092
2093 /* get our ducks in a row for the read */
2094 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2095 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2096 bp->b_flags = B_BUSY | B_READ;
2097 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2098
2099 (*bdevsw[major(bp->b_dev)].d_strategy)(bp);
2100
2101 error = biowait(bp);
2102
2103 if (!error) {
2104 memcpy(clabel, bp->b_un.b_addr,
2105 sizeof(RF_ComponentLabel_t));
2106 #if 0
2107 rf_print_component_label( clabel );
2108 #endif
2109 } else {
2110 #if 0
2111 printf("Failed to read RAID component label!\n");
2112 #endif
2113 }
2114
2115 bp->b_flags = B_INVAL | B_AGE;
2116 brelse(bp);
2117 return(error);
2118 }
2119 /* ARGSUSED */
2120 int
2121 raidwrite_component_label(dev, b_vp, clabel)
2122 dev_t dev;
2123 struct vnode *b_vp;
2124 RF_ComponentLabel_t *clabel;
2125 {
2126 struct buf *bp;
2127 int error;
2128
2129 /* get a block of the appropriate size... */
2130 bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2131 bp->b_dev = dev;
2132
2133 /* get our ducks in a row for the write */
2134 bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2135 bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2136 bp->b_flags = B_BUSY | B_WRITE;
2137 bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2138
2139 memset( bp->b_un.b_addr, 0, RF_COMPONENT_INFO_SIZE );
2140
2141 memcpy( bp->b_un.b_addr, clabel, sizeof(RF_ComponentLabel_t));
2142
2143 (*bdevsw[major(bp->b_dev)].d_strategy)(bp);
2144 error = biowait(bp);
2145 bp->b_flags = B_INVAL | B_AGE;
2146 brelse(bp);
2147 if (error) {
2148 #if 1
2149 printf("Failed to write RAID component info!\n");
2150 #endif
2151 }
2152
2153 return(error);
2154 }
2155
2156 void
2157 rf_markalldirty(raidPtr)
2158 RF_Raid_t *raidPtr;
2159 {
2160 RF_ComponentLabel_t clabel;
2161 int r,c;
2162
2163 raidPtr->mod_counter++;
2164 for (r = 0; r < raidPtr->numRow; r++) {
2165 for (c = 0; c < raidPtr->numCol; c++) {
2166 if (raidPtr->Disks[r][c].status != rf_ds_failed) {
2167 raidread_component_label(
2168 raidPtr->Disks[r][c].dev,
2169 raidPtr->raid_cinfo[r][c].ci_vp,
2170 &clabel);
2171 if (clabel.status == rf_ds_spared) {
2172 /* XXX do something special...
2173 but whatever you do, don't
2174 try to access it!! */
2175 } else {
2176 #if 0
2177 clabel.status =
2178 raidPtr->Disks[r][c].status;
2179 raidwrite_component_label(
2180 raidPtr->Disks[r][c].dev,
2181 raidPtr->raid_cinfo[r][c].ci_vp,
2182 &clabel);
2183 #endif
2184 raidmarkdirty(
2185 raidPtr->Disks[r][c].dev,
2186 raidPtr->raid_cinfo[r][c].ci_vp,
2187 raidPtr->mod_counter);
2188 }
2189 }
2190 }
2191 }
2192 /* printf("Component labels marked dirty.\n"); */
2193 #if 0
2194 for( c = 0; c < raidPtr->numSpare ; c++) {
2195 sparecol = raidPtr->numCol + c;
2196 if (raidPtr->Disks[r][sparecol].status == rf_ds_used_spare) {
2197 /*
2198
2199 XXX this is where we get fancy and map this spare
2200 into it's correct spot in the array.
2201
2202 */
2203 /*
2204
2205 we claim this disk is "optimal" if it's
2206 rf_ds_used_spare, as that means it should be
2207 directly substitutable for the disk it replaced.
2208 We note that too...
2209
2210 */
2211
2212 for(i=0;i<raidPtr->numRow;i++) {
2213 for(j=0;j<raidPtr->numCol;j++) {
2214 if ((raidPtr->Disks[i][j].spareRow ==
2215 r) &&
2216 (raidPtr->Disks[i][j].spareCol ==
2217 sparecol)) {
2218 srow = r;
2219 scol = sparecol;
2220 break;
2221 }
2222 }
2223 }
2224
2225 raidread_component_label(
2226 raidPtr->Disks[r][sparecol].dev,
2227 raidPtr->raid_cinfo[r][sparecol].ci_vp,
2228 &clabel);
2229 /* make sure status is noted */
2230 clabel.version = RF_COMPONENT_LABEL_VERSION;
2231 clabel.mod_counter = raidPtr->mod_counter;
2232 clabel.serial_number = raidPtr->serial_number;
2233 clabel.row = srow;
2234 clabel.column = scol;
2235 clabel.num_rows = raidPtr->numRow;
2236 clabel.num_columns = raidPtr->numCol;
2237 clabel.clean = RF_RAID_DIRTY; /* changed in a bit*/
2238 clabel.status = rf_ds_optimal;
2239 raidwrite_component_label(
2240 raidPtr->Disks[r][sparecol].dev,
2241 raidPtr->raid_cinfo[r][sparecol].ci_vp,
2242 &clabel);
2243 raidmarkclean( raidPtr->Disks[r][sparecol].dev,
2244 raidPtr->raid_cinfo[r][sparecol].ci_vp);
2245 }
2246 }
2247
2248 #endif
2249 }
2250
2251
2252 void
2253 rf_update_component_labels(raidPtr)
2254 RF_Raid_t *raidPtr;
2255 {
2256 RF_ComponentLabel_t clabel;
2257 int sparecol;
2258 int r,c;
2259 int i,j;
2260 int srow, scol;
2261
2262 srow = -1;
2263 scol = -1;
2264
2265 /* XXX should do extra checks to make sure things really are clean,
2266 rather than blindly setting the clean bit... */
2267
2268 raidPtr->mod_counter++;
2269
2270 for (r = 0; r < raidPtr->numRow; r++) {
2271 for (c = 0; c < raidPtr->numCol; c++) {
2272 if (raidPtr->Disks[r][c].status == rf_ds_optimal) {
2273 raidread_component_label(
2274 raidPtr->Disks[r][c].dev,
2275 raidPtr->raid_cinfo[r][c].ci_vp,
2276 &clabel);
2277 /* make sure status is noted */
2278 clabel.status = rf_ds_optimal;
2279 /* bump the counter */
2280 clabel.mod_counter = raidPtr->mod_counter;
2281
2282 raidwrite_component_label(
2283 raidPtr->Disks[r][c].dev,
2284 raidPtr->raid_cinfo[r][c].ci_vp,
2285 &clabel);
2286 }
2287 /* else we don't touch it.. */
2288 }
2289 }
2290
2291 for( c = 0; c < raidPtr->numSpare ; c++) {
2292 sparecol = raidPtr->numCol + c;
2293 if (raidPtr->Disks[0][sparecol].status == rf_ds_used_spare) {
2294 /*
2295
2296 we claim this disk is "optimal" if it's
2297 rf_ds_used_spare, as that means it should be
2298 directly substitutable for the disk it replaced.
2299 We note that too...
2300
2301 */
2302
2303 for(i=0;i<raidPtr->numRow;i++) {
2304 for(j=0;j<raidPtr->numCol;j++) {
2305 if ((raidPtr->Disks[i][j].spareRow ==
2306 0) &&
2307 (raidPtr->Disks[i][j].spareCol ==
2308 sparecol)) {
2309 srow = i;
2310 scol = j;
2311 break;
2312 }
2313 }
2314 }
2315
2316 /* XXX shouldn't *really* need this... */
2317 raidread_component_label(
2318 raidPtr->Disks[0][sparecol].dev,
2319 raidPtr->raid_cinfo[0][sparecol].ci_vp,
2320 &clabel);
2321 /* make sure status is noted */
2322
2323 raid_init_component_label(raidPtr, &clabel);
2324
2325 clabel.mod_counter = raidPtr->mod_counter;
2326 clabel.row = srow;
2327 clabel.column = scol;
2328 clabel.status = rf_ds_optimal;
2329
2330 raidwrite_component_label(
2331 raidPtr->Disks[0][sparecol].dev,
2332 raidPtr->raid_cinfo[0][sparecol].ci_vp,
2333 &clabel);
2334 }
2335 }
2336 /* printf("Component labels updated\n"); */
2337 }
2338
2339
2340 void
2341 rf_final_update_component_labels(raidPtr)
2342 RF_Raid_t *raidPtr;
2343 {
2344 RF_ComponentLabel_t clabel;
2345 int sparecol;
2346 int r,c;
2347 int i,j;
2348 int srow, scol;
2349
2350 srow = -1;
2351 scol = -1;
2352
2353 /* XXX should do extra checks to make sure things really are clean,
2354 rather than blindly setting the clean bit... */
2355
2356 raidPtr->mod_counter++;
2357
2358 for (r = 0; r < raidPtr->numRow; r++) {
2359 for (c = 0; c < raidPtr->numCol; c++) {
2360 if (raidPtr->Disks[r][c].status == rf_ds_optimal) {
2361 raidread_component_label(
2362 raidPtr->Disks[r][c].dev,
2363 raidPtr->raid_cinfo[r][c].ci_vp,
2364 &clabel);
2365 /* make sure status is noted */
2366 clabel.status = rf_ds_optimal;
2367 /* bump the counter */
2368 clabel.mod_counter = raidPtr->mod_counter;
2369
2370 raidwrite_component_label(
2371 raidPtr->Disks[r][c].dev,
2372 raidPtr->raid_cinfo[r][c].ci_vp,
2373 &clabel);
2374 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2375 raidmarkclean(
2376 raidPtr->Disks[r][c].dev,
2377 raidPtr->raid_cinfo[r][c].ci_vp,
2378 raidPtr->mod_counter);
2379 }
2380 }
2381 /* else we don't touch it.. */
2382 }
2383 }
2384
2385 for( c = 0; c < raidPtr->numSpare ; c++) {
2386 sparecol = raidPtr->numCol + c;
2387 if (raidPtr->Disks[0][sparecol].status == rf_ds_used_spare) {
2388 /*
2389
2390 we claim this disk is "optimal" if it's
2391 rf_ds_used_spare, as that means it should be
2392 directly substitutable for the disk it replaced.
2393 We note that too...
2394
2395 */
2396
2397 for(i=0;i<raidPtr->numRow;i++) {
2398 for(j=0;j<raidPtr->numCol;j++) {
2399 if ((raidPtr->Disks[i][j].spareRow ==
2400 0) &&
2401 (raidPtr->Disks[i][j].spareCol ==
2402 sparecol)) {
2403 srow = i;
2404 scol = j;
2405 break;
2406 }
2407 }
2408 }
2409
2410 /* XXX shouldn't *really* need this... */
2411 raidread_component_label(
2412 raidPtr->Disks[0][sparecol].dev,
2413 raidPtr->raid_cinfo[0][sparecol].ci_vp,
2414 &clabel);
2415 /* make sure status is noted */
2416
2417 raid_init_component_label(raidPtr, &clabel);
2418
2419 clabel.mod_counter = raidPtr->mod_counter;
2420 clabel.row = srow;
2421 clabel.column = scol;
2422 clabel.status = rf_ds_optimal;
2423
2424 raidwrite_component_label(
2425 raidPtr->Disks[0][sparecol].dev,
2426 raidPtr->raid_cinfo[0][sparecol].ci_vp,
2427 &clabel);
2428 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2429 raidmarkclean( raidPtr->Disks[0][sparecol].dev,
2430 raidPtr->raid_cinfo[0][sparecol].ci_vp,
2431 raidPtr->mod_counter);
2432 }
2433 }
2434 }
2435 /* printf("Component labels updated\n"); */
2436 }
2437
2438 void
2439 rf_close_component(raidPtr, vp, auto_configured)
2440 RF_Raid_t *raidPtr;
2441 struct vnode *vp;
2442 int auto_configured;
2443 {
2444 struct proc *p;
2445
2446 p = raidPtr->engine_thread;
2447
2448 if (vp != NULL) {
2449 if (auto_configured == 1) {
2450 VOP_CLOSE(vp, FREAD, NOCRED, 0);
2451 vput(vp);
2452
2453 } else {
2454 VOP_UNLOCK(vp, 0);
2455 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
2456 }
2457 } else {
2458 printf("vnode was NULL\n");
2459 }
2460 }
2461
2462
2463 void
2464 rf_UnconfigureVnodes(raidPtr)
2465 RF_Raid_t *raidPtr;
2466 {
2467 int r,c;
2468 struct proc *p;
2469 struct vnode *vp;
2470 int acd;
2471
2472
2473 /* We take this opportunity to close the vnodes like we should.. */
2474
2475 p = raidPtr->engine_thread;
2476
2477 for (r = 0; r < raidPtr->numRow; r++) {
2478 for (c = 0; c < raidPtr->numCol; c++) {
2479 printf("Closing vnode for row: %d col: %d\n", r, c);
2480 vp = raidPtr->raid_cinfo[r][c].ci_vp;
2481 acd = raidPtr->Disks[r][c].auto_configured;
2482 rf_close_component(raidPtr, vp, acd);
2483 raidPtr->raid_cinfo[r][c].ci_vp = NULL;
2484 raidPtr->Disks[r][c].auto_configured = 0;
2485 }
2486 }
2487 for (r = 0; r < raidPtr->numSpare; r++) {
2488 printf("Closing vnode for spare: %d\n", r);
2489 vp = raidPtr->raid_cinfo[0][raidPtr->numCol + r].ci_vp;
2490 acd = raidPtr->Disks[0][raidPtr->numCol + r].auto_configured;
2491 rf_close_component(raidPtr, vp, acd);
2492 raidPtr->raid_cinfo[0][raidPtr->numCol + r].ci_vp = NULL;
2493 raidPtr->Disks[0][raidPtr->numCol + r].auto_configured = 0;
2494 }
2495 }
2496
2497
2498 void
2499 rf_ReconThread(req)
2500 struct rf_recon_req *req;
2501 {
2502 int s;
2503 RF_Raid_t *raidPtr;
2504
2505 s = splbio();
2506 raidPtr = (RF_Raid_t *) req->raidPtr;
2507 raidPtr->recon_in_progress = 1;
2508
2509 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->row, req->col,
2510 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2511
2512 /* XXX get rid of this! we don't need it at all.. */
2513 RF_Free(req, sizeof(*req));
2514
2515 raidPtr->recon_in_progress = 0;
2516 splx(s);
2517
2518 /* That's all... */
2519 kthread_exit(0); /* does not return */
2520 }
2521
2522 void
2523 rf_RewriteParityThread(raidPtr)
2524 RF_Raid_t *raidPtr;
2525 {
2526 int retcode;
2527 int s;
2528
2529 raidPtr->parity_rewrite_in_progress = 1;
2530 s = splbio();
2531 retcode = rf_RewriteParity(raidPtr);
2532 splx(s);
2533 if (retcode) {
2534 printf("raid%d: Error re-writing parity!\n",raidPtr->raidid);
2535 } else {
2536 /* set the clean bit! If we shutdown correctly,
2537 the clean bit on each component label will get
2538 set */
2539 raidPtr->parity_good = RF_RAID_CLEAN;
2540 }
2541 raidPtr->parity_rewrite_in_progress = 0;
2542
2543 /* That's all... */
2544 kthread_exit(0); /* does not return */
2545 }
2546
2547
2548 void
2549 rf_CopybackThread(raidPtr)
2550 RF_Raid_t *raidPtr;
2551 {
2552 int s;
2553
2554 raidPtr->copyback_in_progress = 1;
2555 s = splbio();
2556 rf_CopybackReconstructedData(raidPtr);
2557 splx(s);
2558 raidPtr->copyback_in_progress = 0;
2559
2560 /* That's all... */
2561 kthread_exit(0); /* does not return */
2562 }
2563
2564
2565 void
2566 rf_ReconstructInPlaceThread(req)
2567 struct rf_recon_req *req;
2568 {
2569 int retcode;
2570 int s;
2571 RF_Raid_t *raidPtr;
2572
2573 s = splbio();
2574 raidPtr = req->raidPtr;
2575 raidPtr->recon_in_progress = 1;
2576 retcode = rf_ReconstructInPlace(raidPtr, req->row, req->col);
2577 RF_Free(req, sizeof(*req));
2578 raidPtr->recon_in_progress = 0;
2579 splx(s);
2580
2581 /* That's all... */
2582 kthread_exit(0); /* does not return */
2583 }
2584
/*
 * Mount-root hook for RAID devices.  Intentionally empty: nothing is
 * done with `dev'.
 */
void
rf_mountroot_hook(dev)
	struct device *dev;
{
	/* nothing to do */
}
2591
2592
2593 RF_AutoConfig_t *
2594 rf_find_raid_components()
2595 {
2596 struct devnametobdevmaj *dtobdm;
2597 struct vnode *vp;
2598 struct disklabel label;
2599 struct device *dv;
2600 char *cd_name;
2601 dev_t dev;
2602 int error;
2603 int i;
2604 int good_one;
2605 RF_ComponentLabel_t *clabel;
2606 RF_AutoConfig_t *ac_list;
2607 RF_AutoConfig_t *ac;
2608
2609
2610 /* initialize the AutoConfig list */
2611 ac_list = NULL;
2612
2613 if (raidautoconfig) {
2614
2615 /* we begin by trolling through *all* the devices on the system */
2616
2617 for (dv = alldevs.tqh_first; dv != NULL;
2618 dv = dv->dv_list.tqe_next) {
2619
2620 /* we are only interested in disks... */
2621 if (dv->dv_class != DV_DISK)
2622 continue;
2623
2624 /* we don't care about floppies... */
2625 if (!strcmp(dv->dv_cfdata->cf_driver->cd_name,"fd")) {
2626 continue;
2627 }
2628
2629 /* need to find the device_name_to_block_device_major stuff */
2630 cd_name = dv->dv_cfdata->cf_driver->cd_name;
2631 dtobdm = dev_name2blk;
2632 while (dtobdm->d_name && strcmp(dtobdm->d_name, cd_name)) {
2633 dtobdm++;
2634 }
2635
2636 /* get a vnode for the raw partition of this disk */
2637
2638 dev = MAKEDISKDEV(dtobdm->d_maj, dv->dv_unit, RAW_PART);
2639 if (bdevvp(dev, &vp))
2640 panic("RAID can't alloc vnode");
2641
2642 error = VOP_OPEN(vp, FREAD, NOCRED, 0);
2643
2644 if (error) {
2645 /* "Who cares." Continue looking
2646 for something that exists*/
2647 vput(vp);
2648 continue;
2649 }
2650
2651 /* Ok, the disk exists. Go get the disklabel. */
2652 error = VOP_IOCTL(vp, DIOCGDINFO, (caddr_t)&label,
2653 FREAD, NOCRED, 0);
2654 if (error) {
2655 /*
2656 * XXX can't happen - open() would
2657 * have errored out (or faked up one)
2658 */
2659 printf("can't get label for dev %s%c (%d)!?!?\n",
2660 dv->dv_xname, 'a' + RAW_PART, error);
2661 }
2662
2663 /* don't need this any more. We'll allocate it again
2664 a little later if we really do... */
2665 VOP_CLOSE(vp, FREAD, NOCRED, 0);
2666 vput(vp);
2667
2668 for (i=0; i < label.d_npartitions; i++) {
2669 /* We only support partitions marked as RAID */
2670 if (label.d_partitions[i].p_fstype != FS_RAID)
2671 continue;
2672
2673 dev = MAKEDISKDEV(dtobdm->d_maj, dv->dv_unit, i);
2674 if (bdevvp(dev, &vp))
2675 panic("RAID can't alloc vnode");
2676
2677 error = VOP_OPEN(vp, FREAD, NOCRED, 0);
2678 if (error) {
2679 /* Whatever... */
2680 vput(vp);
2681 continue;
2682 }
2683
2684 good_one = 0;
2685
2686 clabel = (RF_ComponentLabel_t *)
2687 malloc(sizeof(RF_ComponentLabel_t),
2688 M_RAIDFRAME, M_NOWAIT);
2689 if (clabel == NULL) {
2690 /* XXX CLEANUP HERE */
2691 printf("RAID auto config: out of memory!\n");
2692 return(NULL); /* XXX probably should panic? */
2693 }
2694
2695 if (!raidread_component_label(dev, vp, clabel)) {
2696 /* Got the label. Does it look reasonable? */
2697 if (rf_reasonable_label(clabel) &&
2698 (clabel->partitionSize <=
2699 label.d_partitions[i].p_size)) {
2700 #if DEBUG
2701 printf("Component on: %s%c: %d\n",
2702 dv->dv_xname, 'a'+i,
2703 label.d_partitions[i].p_size);
2704 rf_print_component_label(clabel);
2705 #endif
2706 /* if it's reasonable, add it,
2707 else ignore it. */
2708 ac = (RF_AutoConfig_t *)
2709 malloc(sizeof(RF_AutoConfig_t),
2710 M_RAIDFRAME,
2711 M_NOWAIT);
2712 if (ac == NULL) {
2713 /* XXX should panic?? */
2714 return(NULL);
2715 }
2716
2717 sprintf(ac->devname, "%s%c",
2718 dv->dv_xname, 'a'+i);
2719 ac->dev = dev;
2720 ac->vp = vp;
2721 ac->clabel = clabel;
2722 ac->next = ac_list;
2723 ac_list = ac;
2724 good_one = 1;
2725 }
2726 }
2727 if (!good_one) {
2728 /* cleanup */
2729 free(clabel, M_RAIDFRAME);
2730 VOP_CLOSE(vp, FREAD, NOCRED, 0);
2731 vput(vp);
2732 }
2733 }
2734 }
2735 }
2736 return(ac_list);
2737 }
2738
2739 static int
2740 rf_reasonable_label(clabel)
2741 RF_ComponentLabel_t *clabel;
2742 {
2743
2744 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2745 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2746 ((clabel->clean == RF_RAID_CLEAN) ||
2747 (clabel->clean == RF_RAID_DIRTY)) &&
2748 clabel->row >=0 &&
2749 clabel->column >= 0 &&
2750 clabel->num_rows > 0 &&
2751 clabel->num_columns > 0 &&
2752 clabel->row < clabel->num_rows &&
2753 clabel->column < clabel->num_columns &&
2754 clabel->blockSize > 0 &&
2755 clabel->numBlocks > 0) {
2756 /* label looks reasonable enough... */
2757 return(1);
2758 }
2759 return(0);
2760 }
2761
2762
2763 void
2764 rf_print_component_label(clabel)
2765 RF_ComponentLabel_t *clabel;
2766 {
2767 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
2768 clabel->row, clabel->column,
2769 clabel->num_rows, clabel->num_columns);
2770 printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
2771 clabel->version, clabel->serial_number,
2772 clabel->mod_counter);
2773 printf(" Clean: %s Status: %d\n",
2774 clabel->clean ? "Yes" : "No", clabel->status );
2775 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
2776 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
2777 printf(" RAID Level: %c blocksize: %d numBlocks: %d\n",
2778 (char) clabel->parityConfig, clabel->blockSize,
2779 clabel->numBlocks);
2780 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No" );
2781 printf(" Last configured as: raid%d\n", clabel->last_unit );
2782 #if 0
2783 printf(" Config order: %d\n", clabel->config_order);
2784 #endif
2785
2786 }
2787
2788 RF_ConfigSet_t *
2789 rf_create_auto_sets(ac_list)
2790 RF_AutoConfig_t *ac_list;
2791 {
2792 RF_AutoConfig_t *ac;
2793 RF_ConfigSet_t *config_sets;
2794 RF_ConfigSet_t *cset;
2795 RF_AutoConfig_t *ac_next;
2796
2797
2798 config_sets = NULL;
2799
2800 /* Go through the AutoConfig list, and figure out which components
2801 belong to what sets. */
2802 ac = ac_list;
2803 while(ac!=NULL) {
2804 /* we're going to putz with ac->next, so save it here
2805 for use at the end of the loop */
2806 ac_next = ac->next;
2807
2808 if (config_sets == NULL) {
2809 /* will need at least this one... */
2810 config_sets = (RF_ConfigSet_t *)
2811 malloc(sizeof(RF_ConfigSet_t),
2812 M_RAIDFRAME, M_NOWAIT);
2813 if (config_sets == NULL) {
2814 panic("rf_create_auto_sets: No memory!\n");
2815 }
2816 /* this one is easy :) */
2817 config_sets->ac = ac;
2818 config_sets->next = NULL;
2819 config_sets->rootable = 0;
2820 ac->next = NULL;
2821 } else {
2822 /* which set does this component fit into? */
2823 cset = config_sets;
2824 while(cset!=NULL) {
2825 if (rf_does_it_fit(cset, ac)) {
2826 /* looks like it matches */
2827 ac->next = cset->ac;
2828 cset->ac = ac;
2829 break;
2830 }
2831 cset = cset->next;
2832 }
2833 if (cset==NULL) {
2834 /* didn't find a match above... new set..*/
2835 cset = (RF_ConfigSet_t *)
2836 malloc(sizeof(RF_ConfigSet_t),
2837 M_RAIDFRAME, M_NOWAIT);
2838 if (cset == NULL) {
2839 panic("rf_create_auto_sets: No memory!\n");
2840 }
2841 cset->ac = ac;
2842 ac->next = NULL;
2843 cset->next = config_sets;
2844 cset->rootable = 0;
2845 config_sets = cset;
2846 }
2847 }
2848 ac = ac_next;
2849 }
2850
2851
2852 return(config_sets);
2853 }
2854
2855 static int
2856 rf_does_it_fit(cset, ac)
2857 RF_ConfigSet_t *cset;
2858 RF_AutoConfig_t *ac;
2859 {
2860 RF_ComponentLabel_t *clabel1, *clabel2;
2861
2862 /* If this one matches the *first* one in the set, that's good
2863 enough, since the other members of the set would have been
2864 through here too... */
2865 /* note that we are not checking partitionSize here..
2866
2867 Note that we are also not checking the mod_counters here.
2868 If everything else matches execpt the mod_counter, that's
2869 good enough for this test. We will deal with the mod_counters
2870 a little later in the autoconfiguration process.
2871
2872 (clabel1->mod_counter == clabel2->mod_counter) &&
2873
2874 */
2875
2876 clabel1 = cset->ac->clabel;
2877 clabel2 = ac->clabel;
2878 if ((clabel1->version == clabel2->version) &&
2879 (clabel1->serial_number == clabel2->serial_number) &&
2880 (clabel1->num_rows == clabel2->num_rows) &&
2881 (clabel1->num_columns == clabel2->num_columns) &&
2882 (clabel1->sectPerSU == clabel2->sectPerSU) &&
2883 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
2884 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
2885 (clabel1->parityConfig == clabel2->parityConfig) &&
2886 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
2887 (clabel1->blockSize == clabel2->blockSize) &&
2888 (clabel1->numBlocks == clabel2->numBlocks) &&
2889 (clabel1->autoconfigure == clabel2->autoconfigure) &&
2890 (clabel1->root_partition == clabel2->root_partition) &&
2891 (clabel1->last_unit == clabel2->last_unit) &&
2892 (clabel1->config_order == clabel2->config_order)) {
2893 /* if it get's here, it almost *has* to be a match */
2894 } else {
2895 /* it's not consistent with somebody in the set..
2896 punt */
2897 return(0);
2898 }
2899 /* all was fine.. it must fit... */
2900 return(1);
2901 }
2902
2903 int
2904 rf_have_enough_components(cset)
2905 RF_ConfigSet_t *cset;
2906 {
2907 RF_AutoConfig_t *ac;
2908 RF_AutoConfig_t *auto_config;
2909 RF_ComponentLabel_t *clabel;
2910 int r,c;
2911 int num_rows;
2912 int num_cols;
2913 int num_missing;
2914
2915 /* check to see that we have enough 'live' components
2916 of this set. If so, we can configure it if necessary */
2917
2918 num_rows = cset->ac->clabel->num_rows;
2919 num_cols = cset->ac->clabel->num_columns;
2920
2921 /* XXX Check for duplicate components!?!?!? */
2922
2923 num_missing = 0;
2924 auto_config = cset->ac;
2925
2926 for(r=0; r<num_rows; r++) {
2927 for(c=0; c<num_cols; c++) {
2928 ac = auto_config;
2929 while(ac!=NULL) {
2930 if (ac->clabel==NULL) {
2931 /* big-time bad news. */
2932 goto fail;
2933 }
2934 if ((ac->clabel->row == r) &&
2935 (ac->clabel->column == c)) {
2936 /* it's this one... */
2937 #if DEBUG
2938 printf("Found: %s at %d,%d\n",
2939 ac->devname,r,c);
2940 #endif
2941 break;
2942 }
2943 ac=ac->next;
2944 }
2945 if (ac==NULL) {
2946 /* Didn't find one here! */
2947 num_missing++;
2948 }
2949 }
2950 }
2951
2952 clabel = cset->ac->clabel;
2953
2954 if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
2955 ((clabel->parityConfig == '1') && (num_missing > 1)) ||
2956 ((clabel->parityConfig == '4') && (num_missing > 1)) ||
2957 ((clabel->parityConfig == '5') && (num_missing > 1))) {
2958 /* XXX this needs to be made *much* more general */
2959 /* Too many failures */
2960 return(0);
2961 }
2962 /* otherwise, all is well, and we've got enough to take a kick
2963 at autoconfiguring this set */
2964 return(1);
2965 fail:
2966 return(0);
2967
2968 }
2969
2970 void
2971 rf_create_configuration(ac,config,raidPtr)
2972 RF_AutoConfig_t *ac;
2973 RF_Config_t *config;
2974 RF_Raid_t *raidPtr;
2975 {
2976 RF_ComponentLabel_t *clabel;
2977
2978 clabel = ac->clabel;
2979
2980 /* 1. Fill in the common stuff */
2981 config->numRow = clabel->num_rows;
2982 config->numCol = clabel->num_columns;
2983 config->numSpare = 0; /* XXX should this be set here? */
2984 config->sectPerSU = clabel->sectPerSU;
2985 config->SUsPerPU = clabel->SUsPerPU;
2986 config->SUsPerRU = clabel->SUsPerRU;
2987 config->parityConfig = clabel->parityConfig;
2988 /* XXX... */
2989 strcpy(config->diskQueueType,"fifo");
2990 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
2991 config->layoutSpecificSize = 0; /* XXX ?? */
2992
2993 while(ac!=NULL) {
2994 /* row/col values will be in range due to the checks
2995 in reasonable_label() */
2996 strcpy(config->devnames[ac->clabel->row][ac->clabel->column],
2997 ac->devname);
2998 ac = ac->next;
2999 }
3000
3001 }
3002
3003 int
3004 rf_set_autoconfig(raidPtr, new_value)
3005 RF_Raid_t *raidPtr;
3006 int new_value;
3007 {
3008 RF_ComponentLabel_t clabel;
3009 struct vnode *vp;
3010 dev_t dev;
3011 int row, column;
3012
3013 raidPtr->autoconfigure = new_value;
3014 for(row=0; row<raidPtr->numRow; row++) {
3015 for(column=0; column<raidPtr->numCol; column++) {
3016 dev = raidPtr->Disks[row][column].dev;
3017 vp = raidPtr->raid_cinfo[row][column].ci_vp;
3018 raidread_component_label(dev, vp, &clabel);
3019 clabel.autoconfigure = new_value;
3020 raidwrite_component_label(dev, vp, &clabel);
3021 }
3022 }
3023 return(new_value);
3024 }
3025
3026 int
3027 rf_set_rootpartition(raidPtr, new_value)
3028 RF_Raid_t *raidPtr;
3029 int new_value;
3030 {
3031 RF_ComponentLabel_t clabel;
3032 struct vnode *vp;
3033 dev_t dev;
3034 int row, column;
3035
3036 raidPtr->root_partition = new_value;
3037 for(row=0; row<raidPtr->numRow; row++) {
3038 for(column=0; column<raidPtr->numCol; column++) {
3039 dev = raidPtr->Disks[row][column].dev;
3040 vp = raidPtr->raid_cinfo[row][column].ci_vp;
3041 raidread_component_label(dev, vp, &clabel);
3042 clabel.root_partition = new_value;
3043 raidwrite_component_label(dev, vp, &clabel);
3044 }
3045 }
3046 return(new_value);
3047 }
3048
3049 void
3050 rf_release_all_vps(cset)
3051 RF_ConfigSet_t *cset;
3052 {
3053 RF_AutoConfig_t *ac;
3054
3055 ac = cset->ac;
3056 while(ac!=NULL) {
3057 /* Close the vp, and give it back */
3058 if (ac->vp) {
3059 VOP_CLOSE(ac->vp, FREAD, NOCRED, 0);
3060 vput(ac->vp);
3061 }
3062 ac = ac->next;
3063 }
3064 }
3065
3066
3067 void
3068 rf_cleanup_config_set(cset)
3069 RF_ConfigSet_t *cset;
3070 {
3071 RF_AutoConfig_t *ac;
3072 RF_AutoConfig_t *next_ac;
3073
3074 ac = cset->ac;
3075 while(ac!=NULL) {
3076 next_ac = ac->next;
3077 /* nuke the label */
3078 free(ac->clabel, M_RAIDFRAME);
3079 /* cleanup the config structure */
3080 free(ac, M_RAIDFRAME);
3081 /* "next.." */
3082 ac = next_ac;
3083 }
3084 /* and, finally, nuke the config set */
3085 free(cset, M_RAIDFRAME);
3086 }
3087
3088
3089 void
3090 raid_init_component_label(raidPtr, clabel)
3091 RF_Raid_t *raidPtr;
3092 RF_ComponentLabel_t *clabel;
3093 {
3094 /* current version number */
3095 clabel->version = RF_COMPONENT_LABEL_VERSION;
3096 clabel->serial_number = raidPtr->serial_number;
3097 clabel->mod_counter = raidPtr->mod_counter;
3098 clabel->num_rows = raidPtr->numRow;
3099 clabel->num_columns = raidPtr->numCol;
3100 clabel->clean = RF_RAID_DIRTY; /* not clean */
3101 clabel->status = rf_ds_optimal; /* "It's good!" */
3102
3103 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3104 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3105 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3106
3107 clabel->blockSize = raidPtr->bytesPerSector;
3108 clabel->numBlocks = raidPtr->sectorsPerDisk;
3109
3110 /* XXX not portable */
3111 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3112 clabel->maxOutstanding = raidPtr->maxOutstanding;
3113 clabel->autoconfigure = raidPtr->autoconfigure;
3114 clabel->root_partition = raidPtr->root_partition;
3115 clabel->last_unit = raidPtr->raidid;
3116 clabel->config_order = raidPtr->config_order;
3117 }
3118
3119 int
3120 rf_auto_config_set(cset,unit)
3121 RF_ConfigSet_t *cset;
3122 int *unit;
3123 {
3124 RF_Raid_t *raidPtr;
3125 RF_Config_t *config;
3126 int raidID;
3127 int retcode;
3128
3129 printf("RAID autoconfigure\n");
3130
3131 retcode = 0;
3132 *unit = -1;
3133
3134 /* 1. Create a config structure */
3135
3136 config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
3137 M_RAIDFRAME,
3138 M_NOWAIT);
3139 if (config==NULL) {
3140 printf("Out of mem!?!?\n");
3141 /* XXX do something more intelligent here. */
3142 return(1);
3143 }
3144 /* XXX raidID needs to be set correctly.. */
3145
3146 /*
3147 2. Figure out what RAID ID this one is supposed to live at
3148 See if we can get the same RAID dev that it was configured
3149 on last time..
3150 */
3151
3152 raidID = cset->ac->clabel->last_unit;
3153 if ((raidID < 0) || (raidID >= numraid)) {
3154 /* let's not wander off into lala land. */
3155 raidID = numraid - 1;
3156 }
3157 if (raidPtrs[raidID]->valid != 0) {
3158
3159 /*
3160 Nope... Go looking for an alternative...
3161 Start high so we don't immediately use raid0 if that's
3162 not taken.
3163 */
3164
3165 for(raidID = numraid; raidID >= 0; raidID--) {
3166 if (raidPtrs[raidID]->valid == 0) {
3167 /* can use this one! */
3168 break;
3169 }
3170 }
3171 }
3172
3173 if (raidID < 0) {
3174 /* punt... */
3175 printf("Unable to auto configure this set!\n");
3176 printf("(Out of RAID devs!)\n");
3177 return(1);
3178 }
3179 printf("Configuring raid%d:\n",raidID);
3180 raidPtr = raidPtrs[raidID];
3181
3182 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3183 raidPtr->raidid = raidID;
3184 raidPtr->openings = RAIDOUTSTANDING;
3185
3186 /* 3. Build the configuration structure */
3187 rf_create_configuration(cset->ac, config, raidPtr);
3188
3189 /* 4. Do the configuration */
3190 retcode = rf_Configure(raidPtr, config, cset->ac);
3191
3192 if (retcode == 0) {
3193
3194 raidinit(raidPtrs[raidID]);
3195
3196 rf_markalldirty(raidPtrs[raidID]);
3197 raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
3198 if (cset->ac->clabel->root_partition==1) {
3199 /* everything configured just fine. Make a note
3200 that this set is eligible to be root. */
3201 cset->rootable = 1;
3202 /* XXX do this here? */
3203 raidPtrs[raidID]->root_partition = 1;
3204 }
3205 }
3206
3207 /* 5. Cleanup */
3208 free(config, M_RAIDFRAME);
3209
3210 *unit = raidID;
3211 return(retcode);
3212 }
3213