rf_netbsdkintf.c revision 1.32 1 /* $NetBSD: rf_netbsdkintf.c,v 1.32 1999/12/03 02:43:22 oster Exp $ */
2 /*-
3 * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Greg Oster; Jason R. Thorpe.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the NetBSD
20 * Foundation, Inc. and its contributors.
21 * 4. Neither the name of The NetBSD Foundation nor the names of its
22 * contributors may be used to endorse or promote products derived
23 * from this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38 /*
39 * Copyright (c) 1988 University of Utah.
40 * Copyright (c) 1990, 1993
41 * The Regents of the University of California. All rights reserved.
42 *
43 * This code is derived from software contributed to Berkeley by
44 * the Systems Programming Group of the University of Utah Computer
45 * Science Department.
46 *
47 * Redistribution and use in source and binary forms, with or without
48 * modification, are permitted provided that the following conditions
49 * are met:
50 * 1. Redistributions of source code must retain the above copyright
51 * notice, this list of conditions and the following disclaimer.
52 * 2. Redistributions in binary form must reproduce the above copyright
53 * notice, this list of conditions and the following disclaimer in the
54 * documentation and/or other materials provided with the distribution.
55 * 3. All advertising materials mentioning features or use of this software
56 * must display the following acknowledgement:
57 * This product includes software developed by the University of
58 * California, Berkeley and its contributors.
59 * 4. Neither the name of the University nor the names of its contributors
60 * may be used to endorse or promote products derived from this software
61 * without specific prior written permission.
62 *
63 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73 * SUCH DAMAGE.
74 *
75 * from: Utah $Hdr: cd.c 1.6 90/11/28$
76 *
77 * @(#)cd.c 8.2 (Berkeley) 11/16/93
78 */
79
80
81
82
83 /*
84 * Copyright (c) 1995 Carnegie-Mellon University.
85 * All rights reserved.
86 *
87 * Authors: Mark Holland, Jim Zelenka
88 *
89 * Permission to use, copy, modify and distribute this software and
90 * its documentation is hereby granted, provided that both the copyright
91 * notice and this permission notice appear in all copies of the
92 * software, derivative works or modified versions, and any portions
93 * thereof, and that both notices appear in supporting documentation.
94 *
95 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
96 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
97 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
98 *
99 * Carnegie Mellon requests users of this software to return to
100 *
101 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
102 * School of Computer Science
103 * Carnegie Mellon University
104 * Pittsburgh PA 15213-3890
105 *
106 * any improvements or extensions that they make and grant Carnegie the
107 * rights to redistribute these changes.
108 */
109
110 /***********************************************************
111 *
112 * rf_kintf.c -- the kernel interface routines for RAIDframe
113 *
114 ***********************************************************/
115
116 #include <sys/errno.h>
117 #include <sys/param.h>
118 #include <sys/pool.h>
119 #include <sys/queue.h>
120 #include <sys/disk.h>
121 #include <sys/device.h>
122 #include <sys/stat.h>
123 #include <sys/ioctl.h>
124 #include <sys/fcntl.h>
125 #include <sys/systm.h>
126 #include <sys/namei.h>
127 #include <sys/vnode.h>
128 #include <sys/param.h>
129 #include <sys/types.h>
130 #include <machine/types.h>
131 #include <sys/disklabel.h>
132 #include <sys/conf.h>
133 #include <sys/lock.h>
134 #include <sys/buf.h>
135 #include <sys/user.h>
136
137 #include "raid.h"
138 #include "rf_raid.h"
139 #include "rf_raidframe.h"
140 #include "rf_dag.h"
141 #include "rf_dagflags.h"
142 #include "rf_diskqueue.h"
143 #include "rf_acctrace.h"
144 #include "rf_etimer.h"
145 #include "rf_general.h"
146 #include "rf_debugMem.h"
147 #include "rf_kintf.h"
148 #include "rf_options.h"
149 #include "rf_driver.h"
150 #include "rf_parityscan.h"
151 #include "rf_debugprint.h"
152 #include "rf_threadstuff.h"
153
154 int rf_kdebug_level = 0;
155
156 #ifdef DEBUG
157 #define db0_printf(a) printf a
158 #define db_printf(a) if (rf_kdebug_level > 0) printf a
159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
160 #define db2_printf(a) if (rf_kdebug_level > 1) printf a
161 #define db3_printf(a) if (rf_kdebug_level > 2) printf a
162 #define db4_printf(a) if (rf_kdebug_level > 3) printf a
163 #define db5_printf(a) if (rf_kdebug_level > 4) printf a
164 #else /* DEBUG */
165 #define db0_printf(a) printf a
166 #define db1_printf(a) { }
167 #define db2_printf(a) { }
168 #define db3_printf(a) { }
169 #define db4_printf(a) { }
170 #define db5_printf(a) { }
171 #endif /* DEBUG */
172
173 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
174
175 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
176
177 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
178 * spare table */
179 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
180 * installation process */
181
182 static struct rf_recon_req *recon_queue = NULL; /* used to communicate
183 * reconstruction
184 * requests */
185
186
187 decl_simple_lock_data(, recon_queue_mutex)
188 #define LOCK_RECON_Q_MUTEX() simple_lock(&recon_queue_mutex)
189 #define UNLOCK_RECON_Q_MUTEX() simple_unlock(&recon_queue_mutex)
190
191 /* prototypes */
192 static void KernelWakeupFunc(struct buf * bp);
193 static void InitBP(struct buf * bp, struct vnode *, unsigned rw_flag,
194 dev_t dev, RF_SectorNum_t startSect,
195 RF_SectorCount_t numSect, caddr_t buf,
196 void (*cbFunc) (struct buf *), void *cbArg,
197 int logBytesPerSector, struct proc * b_proc);
198
199 #define Dprintf0(s) if (rf_queueDebug) \
200 rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
201 #define Dprintf1(s,a) if (rf_queueDebug) \
202 rf_debug_printf(s,a,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
203 #define Dprintf2(s,a,b) if (rf_queueDebug) \
204 rf_debug_printf(s,a,b,NULL,NULL,NULL,NULL,NULL,NULL)
205 #define Dprintf3(s,a,b,c) if (rf_queueDebug) \
206 rf_debug_printf(s,a,b,c,NULL,NULL,NULL,NULL,NULL)
207
208 int raidmarkclean(dev_t dev, struct vnode *b_vp, int);
209 int raidmarkdirty(dev_t dev, struct vnode *b_vp, int);
210
211 void raidattach __P((int));
212 int raidsize __P((dev_t));
213
214 void rf_DiskIOComplete(RF_DiskQueue_t *, RF_DiskQueueData_t *, int);
215 void rf_CopybackReconstructedData(RF_Raid_t * raidPtr);
216 static int raidinit __P((dev_t, RF_Raid_t *, int));
217
218 int raidopen __P((dev_t, int, int, struct proc *));
219 int raidclose __P((dev_t, int, int, struct proc *));
220 int raidioctl __P((dev_t, u_long, caddr_t, int, struct proc *));
221 int raidwrite __P((dev_t, struct uio *, int));
222 int raidread __P((dev_t, struct uio *, int));
223 void raidstrategy __P((struct buf *));
224 int raiddump __P((dev_t, daddr_t, caddr_t, size_t));
225
226 int raidwrite_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *);
227 int raidread_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *);
228 void rf_update_component_labels( RF_Raid_t *);
229 /*
230 * Pilfered from ccd.c
231 */
232
/*
 * Per-component I/O descriptor (pilfered from ccd.c): wraps the buf
 * handed down to a component driver and links it back to the original
 * request.  The embedded struct buf MUST remain the first member so a
 * (struct raidbuf *) and its (struct buf *) are interchangeable.
 */
struct raidbuf {
	struct buf rf_buf;	/* new I/O buf.  MUST BE FIRST!!! */
	struct buf *rf_obp;	/* ptr. to original I/O buf */
	int     rf_flags;	/* misc. flags */
	RF_DiskQueueData_t *req;/* the request that this was part of.. */
};
239
240
241 #define RAIDGETBUF(rs) pool_get(&(rs)->sc_cbufpool, PR_NOWAIT)
242 #define RAIDPUTBUF(rs, cbp) pool_put(&(rs)->sc_cbufpool, cbp)
243
244 /* XXX Not sure if the following should be replacing the raidPtrs above,
245 or if it should be used in conjunction with that... */
246
/*
 * Per-unit software state, one entry per configured RAID device.
 * Allocated as an array (raid_softc[]) in raidattach(), indexed by
 * raidunit(dev).
 */
struct raid_softc {
	int     sc_flags;	/* flags (RAIDF_* below) */
	int     sc_cflags;	/* configuration flags */
	size_t  sc_size;	/* size of the raid device */
	dev_t   sc_dev;		/* our device.. */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct pool sc_cbufpool;	/* component buffer pool */
};
256 /* sc_flags */
257 #define RAIDF_INITED 0x01 /* unit has been initialized */
258 #define RAIDF_WLABEL 0x02 /* label area is writable */
259 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
260 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
261 #define RAIDF_LOCKED 0x80 /* unit is locked */
262
263 #define raidunit(x) DISKUNIT(x)
264 static int numraid = 0;
265
266 /*
267 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
268 * Be aware that large numbers can allow the driver to consume a lot of
269 * kernel memory, especially on writes, and in degraded mode reads.
270 *
271 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
272 * a single 64K write will typically require 64K for the old data,
273 * 64K for the old parity, and 64K for the new parity, for a total
274 * of 192K (if the parity buffer is not re-used immediately).
275  * Even if it is used immediately, that's still 128K, which when multiplied
276 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
277 *
278 * Now in degraded mode, for example, a 64K read on the above setup may
279 * require data reconstruction, which will require *all* of the 4 remaining
280 * disks to participate -- 4 * 32K/disk == 128K again.
281 */
282
283 #ifndef RAIDOUTSTANDING
284 #define RAIDOUTSTANDING 6
285 #endif
286
287 #define RAIDLABELDEV(dev) \
288 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
289
290 /* declared here, and made public, for the benefit of KVM stuff.. */
291 struct raid_softc *raid_softc;
292
293 static void raidgetdefaultlabel __P((RF_Raid_t *, struct raid_softc *,
294 struct disklabel *));
295 static void raidgetdisklabel __P((dev_t));
296 static void raidmakedisklabel __P((struct raid_softc *));
297
298 static int raidlock __P((struct raid_softc *));
299 static void raidunlock __P((struct raid_softc *));
300 int raidlookup __P((char *, struct proc * p, struct vnode **));
301
302 static void rf_markalldirty __P((RF_Raid_t *));
303
304 void
305 raidattach(num)
306 int num;
307 {
308 int raidID;
309 int i, rc;
310
311 #ifdef DEBUG
312 printf("raidattach: Asked for %d units\n", num);
313 #endif
314
315 if (num <= 0) {
316 #ifdef DIAGNOSTIC
317 panic("raidattach: count <= 0");
318 #endif
319 return;
320 }
321 /* This is where all the initialization stuff gets done. */
322
323 /* Make some space for requested number of units... */
324
325 RF_Calloc(raidPtrs, num, sizeof(RF_Raid_t *), (RF_Raid_t **));
326 if (raidPtrs == NULL) {
327 panic("raidPtrs is NULL!!\n");
328 }
329
330 rc = rf_mutex_init(&rf_sparet_wait_mutex);
331 if (rc) {
332 RF_PANIC();
333 }
334
335 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
336 recon_queue = NULL;
337
338 for (i = 0; i < numraid; i++)
339 raidPtrs[i] = NULL;
340 rc = rf_BootRaidframe();
341 if (rc == 0)
342 printf("Kernelized RAIDframe activated\n");
343 else
344 panic("Serious error booting RAID!!\n");
345
346 /* put together some datastructures like the CCD device does.. This
347 * lets us lock the device and what-not when it gets opened. */
348
349 raid_softc = (struct raid_softc *)
350 malloc(num * sizeof(struct raid_softc),
351 M_RAIDFRAME, M_NOWAIT);
352 if (raid_softc == NULL) {
353 printf("WARNING: no memory for RAIDframe driver\n");
354 return;
355 }
356 numraid = num;
357 bzero(raid_softc, num * sizeof(struct raid_softc));
358
359 for (raidID = 0; raidID < num; raidID++) {
360 RF_Calloc(raidPtrs[raidID], 1, sizeof(RF_Raid_t),
361 (RF_Raid_t *));
362 if (raidPtrs[raidID] == NULL) {
363 printf("raidPtrs[%d] is NULL\n", raidID);
364 }
365 }
366 }
367
368
369 int
370 raidsize(dev)
371 dev_t dev;
372 {
373 struct raid_softc *rs;
374 struct disklabel *lp;
375 int part, unit, omask, size;
376
377 unit = raidunit(dev);
378 if (unit >= numraid)
379 return (-1);
380 rs = &raid_softc[unit];
381
382 if ((rs->sc_flags & RAIDF_INITED) == 0)
383 return (-1);
384
385 part = DISKPART(dev);
386 omask = rs->sc_dkdev.dk_openmask & (1 << part);
387 lp = rs->sc_dkdev.dk_label;
388
389 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curproc))
390 return (-1);
391
392 if (lp->d_partitions[part].p_fstype != FS_SWAP)
393 size = -1;
394 else
395 size = lp->d_partitions[part].p_size *
396 (lp->d_secsize / DEV_BSIZE);
397
398 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curproc))
399 return (-1);
400
401 return (size);
402
403 }
404
405 int
406 raiddump(dev, blkno, va, size)
407 dev_t dev;
408 daddr_t blkno;
409 caddr_t va;
410 size_t size;
411 {
412 /* Not implemented. */
413 return ENXIO;
414 }
415 /* ARGSUSED */
416 int
417 raidopen(dev, flags, fmt, p)
418 dev_t dev;
419 int flags, fmt;
420 struct proc *p;
421 {
422 int unit = raidunit(dev);
423 struct raid_softc *rs;
424 struct disklabel *lp;
425 int part, pmask;
426 int error = 0;
427
428 if (unit >= numraid)
429 return (ENXIO);
430 rs = &raid_softc[unit];
431
432 if ((error = raidlock(rs)) != 0)
433 return (error);
434 lp = rs->sc_dkdev.dk_label;
435
436 part = DISKPART(dev);
437 pmask = (1 << part);
438
439 db1_printf(("Opening raid device number: %d partition: %d\n",
440 unit, part));
441
442
443 if ((rs->sc_flags & RAIDF_INITED) &&
444 (rs->sc_dkdev.dk_openmask == 0))
445 raidgetdisklabel(dev);
446
447 /* make sure that this partition exists */
448
449 if (part != RAW_PART) {
450 db1_printf(("Not a raw partition..\n"));
451 if (((rs->sc_flags & RAIDF_INITED) == 0) ||
452 ((part >= lp->d_npartitions) ||
453 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
454 error = ENXIO;
455 raidunlock(rs);
456 db1_printf(("Bailing out...\n"));
457 return (error);
458 }
459 }
460 /* Prevent this unit from being unconfigured while open. */
461 switch (fmt) {
462 case S_IFCHR:
463 rs->sc_dkdev.dk_copenmask |= pmask;
464 break;
465
466 case S_IFBLK:
467 rs->sc_dkdev.dk_bopenmask |= pmask;
468 break;
469 }
470
471 if ((rs->sc_dkdev.dk_openmask == 0) &&
472 ((rs->sc_flags & RAIDF_INITED) != 0)) {
473 /* First one... mark things as dirty... Note that we *MUST*
474 have done a configure before this. I DO NOT WANT TO BE
475 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
476 THAT THEY BELONG TOGETHER!!!!! */
477 /* XXX should check to see if we're only open for reading
478 here... If so, we needn't do this, but then need some
479 other way of keeping track of what's happened.. */
480
481 rf_markalldirty( raidPtrs[unit] );
482 }
483
484
485 rs->sc_dkdev.dk_openmask =
486 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
487
488 raidunlock(rs);
489
490 return (error);
491
492
493 }
494 /* ARGSUSED */
495 int
496 raidclose(dev, flags, fmt, p)
497 dev_t dev;
498 int flags, fmt;
499 struct proc *p;
500 {
501 int unit = raidunit(dev);
502 struct raid_softc *rs;
503 int error = 0;
504 int part;
505
506 if (unit >= numraid)
507 return (ENXIO);
508 rs = &raid_softc[unit];
509
510 if ((error = raidlock(rs)) != 0)
511 return (error);
512
513 part = DISKPART(dev);
514
515 /* ...that much closer to allowing unconfiguration... */
516 switch (fmt) {
517 case S_IFCHR:
518 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
519 break;
520
521 case S_IFBLK:
522 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
523 break;
524 }
525 rs->sc_dkdev.dk_openmask =
526 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
527
528 if ((rs->sc_dkdev.dk_openmask == 0) &&
529 ((rs->sc_flags & RAIDF_INITED) != 0)) {
530 /* Last one... device is not unconfigured yet.
531 Device shutdown has taken care of setting the
532 clean bits if RAIDF_INITED is not set
533 mark things as clean... */
534 rf_update_component_labels( raidPtrs[unit] );
535 }
536
537 raidunlock(rs);
538 return (0);
539
540 }
541
542 void
543 raidstrategy(bp)
544 register struct buf *bp;
545 {
546 register int s;
547
548 unsigned int raidID = raidunit(bp->b_dev);
549 RF_Raid_t *raidPtr;
550 struct raid_softc *rs = &raid_softc[raidID];
551 struct disklabel *lp;
552 int wlabel;
553
554 #if 0
555 db1_printf(("Strategy: 0x%x 0x%x\n", bp, bp->b_data));
556 db1_printf(("Strategy(2): bp->b_bufsize%d\n", (int) bp->b_bufsize));
557 db1_printf(("bp->b_count=%d\n", (int) bp->b_bcount));
558 db1_printf(("bp->b_resid=%d\n", (int) bp->b_resid));
559 db1_printf(("bp->b_blkno=%d\n", (int) bp->b_blkno));
560
561 if (bp->b_flags & B_READ)
562 db1_printf(("READ\n"));
563 else
564 db1_printf(("WRITE\n"));
565 #endif
566 if ((rs->sc_flags & RAIDF_INITED) ==0) {
567 bp->b_error = ENXIO;
568 bp->b_flags = B_ERROR;
569 bp->b_resid = bp->b_bcount;
570 biodone(bp);
571 return;
572 }
573 if (raidID >= numraid || !raidPtrs[raidID]) {
574 bp->b_error = ENODEV;
575 bp->b_flags |= B_ERROR;
576 bp->b_resid = bp->b_bcount;
577 biodone(bp);
578 return;
579 }
580 raidPtr = raidPtrs[raidID];
581 if (!raidPtr->valid) {
582 bp->b_error = ENODEV;
583 bp->b_flags |= B_ERROR;
584 bp->b_resid = bp->b_bcount;
585 biodone(bp);
586 return;
587 }
588 if (bp->b_bcount == 0) {
589 db1_printf(("b_bcount is zero..\n"));
590 biodone(bp);
591 return;
592 }
593 lp = rs->sc_dkdev.dk_label;
594
595 /*
596 * Do bounds checking and adjust transfer. If there's an
597 * error, the bounds check will flag that for us.
598 */
599
600 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
601 if (DISKPART(bp->b_dev) != RAW_PART)
602 if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
603 db1_printf(("Bounds check failed!!:%d %d\n",
604 (int) bp->b_blkno, (int) wlabel));
605 biodone(bp);
606 return;
607 }
608 s = splbio(); /* XXX Needed? */
609 db1_printf(("Beginning strategy...\n"));
610
611 bp->b_resid = 0;
612 bp->b_error = rf_DoAccessKernel(raidPtrs[raidID], bp,
613 NULL, NULL, NULL);
614 if (bp->b_error) {
615 bp->b_flags |= B_ERROR;
616 db1_printf(("bp->b_flags HAS B_ERROR SET!!!: %d\n",
617 bp->b_error));
618 }
619 splx(s);
620 #if 0
621 db1_printf(("Strategy exiting: 0x%x 0x%x %d %d\n",
622 bp, bp->b_data,
623 (int) bp->b_bcount, (int) bp->b_resid));
624 #endif
625 }
626 /* ARGSUSED */
627 int
628 raidread(dev, uio, flags)
629 dev_t dev;
630 struct uio *uio;
631 int flags;
632 {
633 int unit = raidunit(dev);
634 struct raid_softc *rs;
635 int part;
636
637 if (unit >= numraid)
638 return (ENXIO);
639 rs = &raid_softc[unit];
640
641 if ((rs->sc_flags & RAIDF_INITED) == 0)
642 return (ENXIO);
643 part = DISKPART(dev);
644
645 db1_printf(("raidread: unit: %d partition: %d\n", unit, part));
646
647 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
648
649 }
650 /* ARGSUSED */
651 int
652 raidwrite(dev, uio, flags)
653 dev_t dev;
654 struct uio *uio;
655 int flags;
656 {
657 int unit = raidunit(dev);
658 struct raid_softc *rs;
659
660 if (unit >= numraid)
661 return (ENXIO);
662 rs = &raid_softc[unit];
663
664 if ((rs->sc_flags & RAIDF_INITED) == 0)
665 return (ENXIO);
666 db1_printf(("raidwrite\n"));
667 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
668
669 }
670
671 int
672 raidioctl(dev, cmd, data, flag, p)
673 dev_t dev;
674 u_long cmd;
675 caddr_t data;
676 int flag;
677 struct proc *p;
678 {
679 int unit = raidunit(dev);
680 int error = 0;
681 int part, pmask;
682 struct raid_softc *rs;
683 RF_Config_t *k_cfg, *u_cfg;
684 u_char *specific_buf;
685 int retcode = 0;
686 int row;
687 int column;
688 int s;
689 struct rf_recon_req *rrcopy, *rr;
690 RF_ComponentLabel_t *component_label;
691 RF_ComponentLabel_t ci_label;
692 RF_ComponentLabel_t **c_label_ptr;
693 RF_SingleComponent_t *sparePtr,*componentPtr;
694 RF_SingleComponent_t hot_spare;
695 RF_SingleComponent_t component;
696
697 if (unit >= numraid)
698 return (ENXIO);
699 rs = &raid_softc[unit];
700
701 db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
702 (int) DISKPART(dev), (int) unit, (int) cmd));
703
704 /* Must be open for writes for these commands... */
705 switch (cmd) {
706 case DIOCSDINFO:
707 case DIOCWDINFO:
708 case DIOCWLABEL:
709 if ((flag & FWRITE) == 0)
710 return (EBADF);
711 }
712
713 /* Must be initialized for these... */
714 switch (cmd) {
715 case DIOCGDINFO:
716 case DIOCSDINFO:
717 case DIOCWDINFO:
718 case DIOCGPART:
719 case DIOCWLABEL:
720 case DIOCGDEFLABEL:
721 case RAIDFRAME_SHUTDOWN:
722 case RAIDFRAME_REWRITEPARITY:
723 case RAIDFRAME_GET_INFO:
724 case RAIDFRAME_RESET_ACCTOTALS:
725 case RAIDFRAME_GET_ACCTOTALS:
726 case RAIDFRAME_KEEP_ACCTOTALS:
727 case RAIDFRAME_GET_SIZE:
728 case RAIDFRAME_FAIL_DISK:
729 case RAIDFRAME_COPYBACK:
730 case RAIDFRAME_CHECKRECON:
731 case RAIDFRAME_GET_COMPONENT_LABEL:
732 case RAIDFRAME_SET_COMPONENT_LABEL:
733 case RAIDFRAME_ADD_HOT_SPARE:
734 case RAIDFRAME_REMOVE_HOT_SPARE:
735 case RAIDFRAME_INIT_LABELS:
736 case RAIDFRAME_REBUILD_IN_PLACE:
737 case RAIDFRAME_CHECK_PARITY:
738 if ((rs->sc_flags & RAIDF_INITED) == 0)
739 return (ENXIO);
740 }
741
742 switch (cmd) {
743
744
745 /* configure the system */
746 case RAIDFRAME_CONFIGURE:
747
748 db3_printf(("rf_ioctl: RAIDFRAME_CONFIGURE\n"));
749 /* copy-in the configuration information */
750 /* data points to a pointer to the configuration structure */
751 u_cfg = *((RF_Config_t **) data);
752 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
753 if (k_cfg == NULL) {
754 db3_printf(("rf_ioctl: ENOMEM for config. Code is %d\n", retcode));
755 return (ENOMEM);
756 }
757 retcode = copyin((caddr_t) u_cfg, (caddr_t) k_cfg,
758 sizeof(RF_Config_t));
759 if (retcode) {
760 db3_printf(("rf_ioctl: retcode=%d copyin.1\n",
761 retcode));
762 return (retcode);
763 }
764 /* allocate a buffer for the layout-specific data, and copy it
765 * in */
766 if (k_cfg->layoutSpecificSize) {
767 if (k_cfg->layoutSpecificSize > 10000) {
768 /* sanity check */
769 db3_printf(("rf_ioctl: EINVAL %d\n", retcode));
770 return (EINVAL);
771 }
772 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
773 (u_char *));
774 if (specific_buf == NULL) {
775 RF_Free(k_cfg, sizeof(RF_Config_t));
776 db3_printf(("rf_ioctl: ENOMEM %d\n", retcode));
777 return (ENOMEM);
778 }
779 retcode = copyin(k_cfg->layoutSpecific,
780 (caddr_t) specific_buf,
781 k_cfg->layoutSpecificSize);
782 if (retcode) {
783 db3_printf(("rf_ioctl: retcode=%d copyin.2\n",
784 retcode));
785 return (retcode);
786 }
787 } else
788 specific_buf = NULL;
789 k_cfg->layoutSpecific = specific_buf;
790
791 /* should do some kind of sanity check on the configuration.
792 * Store the sum of all the bytes in the last byte? */
793
794 /* configure the system */
795
796 raidPtrs[unit]->raidid = unit;
797
798 retcode = rf_Configure(raidPtrs[unit], k_cfg);
799
800 /* allow this many simultaneous IO's to this RAID device */
801 raidPtrs[unit]->openings = RAIDOUTSTANDING;
802
803 if (retcode == 0) {
804 retcode = raidinit(dev, raidPtrs[unit], unit);
805 rf_markalldirty( raidPtrs[unit] );
806 }
807 /* free the buffers. No return code here. */
808 if (k_cfg->layoutSpecificSize) {
809 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
810 }
811 RF_Free(k_cfg, sizeof(RF_Config_t));
812
813 db3_printf(("rf_ioctl: retcode=%d RAIDFRAME_CONFIGURE\n",
814 retcode));
815
816 return (retcode);
817
818 /* shutdown the system */
819 case RAIDFRAME_SHUTDOWN:
820
821 if ((error = raidlock(rs)) != 0)
822 return (error);
823
824 /*
825 * If somebody has a partition mounted, we shouldn't
826 * shutdown.
827 */
828
829 part = DISKPART(dev);
830 pmask = (1 << part);
831 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
832 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
833 (rs->sc_dkdev.dk_copenmask & pmask))) {
834 raidunlock(rs);
835 return (EBUSY);
836 }
837
838 if (rf_debugKernelAccess) {
839 printf("call shutdown\n");
840 }
841
842 retcode = rf_Shutdown(raidPtrs[unit]);
843
844 db1_printf(("Done main shutdown\n"));
845
846 pool_destroy(&rs->sc_cbufpool);
847 db1_printf(("Done freeing component buffer freelist\n"));
848
849 /* It's no longer initialized... */
850 rs->sc_flags &= ~RAIDF_INITED;
851
852 /* Detach the disk. */
853 disk_detach(&rs->sc_dkdev);
854
855 raidunlock(rs);
856
857 return (retcode);
858 case RAIDFRAME_GET_COMPONENT_LABEL:
859 c_label_ptr = (RF_ComponentLabel_t **) data;
860 /* need to read the component label for the disk indicated
861 by row,column in component_label
862 XXX need to sanity check these values!!!
863 */
864
865 /* For practice, let's get it directly fromdisk, rather
866 than from the in-core copy */
867 RF_Malloc( component_label, sizeof( RF_ComponentLabel_t ),
868 (RF_ComponentLabel_t *));
869 if (component_label == NULL)
870 return (ENOMEM);
871
872 bzero((char *) component_label, sizeof(RF_ComponentLabel_t));
873
874 retcode = copyin( *c_label_ptr, component_label,
875 sizeof(RF_ComponentLabel_t));
876
877 if (retcode) {
878 return(retcode);
879 }
880
881 row = component_label->row;
882 column = component_label->column;
883
884 if ((row < 0) || (row >= raidPtrs[unit]->numRow) ||
885 (column < 0) || (column >= raidPtrs[unit]->numCol)) {
886 return(EINVAL);
887 }
888
889 raidread_component_label(
890 raidPtrs[unit]->Disks[row][column].dev,
891 raidPtrs[unit]->raid_cinfo[row][column].ci_vp,
892 component_label );
893
894 retcode = copyout((caddr_t) component_label,
895 (caddr_t) *c_label_ptr,
896 sizeof(RF_ComponentLabel_t));
897 RF_Free( component_label, sizeof(RF_ComponentLabel_t));
898 return (retcode);
899
900 case RAIDFRAME_SET_COMPONENT_LABEL:
901 component_label = (RF_ComponentLabel_t *) data;
902
903 /* XXX check the label for valid stuff... */
904 /* Note that some things *should not* get modified --
905 the user should be re-initing the labels instead of
906 trying to patch things.
907 */
908
909 printf("Got component label:\n");
910 printf("Version: %d\n",component_label->version);
911 printf("Serial Number: %d\n",component_label->serial_number);
912 printf("Mod counter: %d\n",component_label->mod_counter);
913 printf("Row: %d\n", component_label->row);
914 printf("Column: %d\n", component_label->column);
915 printf("Num Rows: %d\n", component_label->num_rows);
916 printf("Num Columns: %d\n", component_label->num_columns);
917 printf("Clean: %d\n", component_label->clean);
918 printf("Status: %d\n", component_label->status);
919
920 row = component_label->row;
921 column = component_label->column;
922
923 if ((row < 0) || (row >= raidPtrs[unit]->numRow) ||
924 (column < 0) || (column >= raidPtrs[unit]->numCol)) {
925 return(EINVAL);
926 }
927
928 /* XXX this isn't allowed to do anything for now :-) */
929 #if 0
930 raidwrite_component_label(
931 raidPtrs[unit]->Disks[row][column].dev,
932 raidPtrs[unit]->raid_cinfo[row][column].ci_vp,
933 component_label );
934 #endif
935 return (0);
936
937 case RAIDFRAME_INIT_LABELS:
938 component_label = (RF_ComponentLabel_t *) data;
939 /*
940 we only want the serial number from
941 the above. We get all the rest of the information
942 from the config that was used to create this RAID
943 set.
944 */
945
946 raidPtrs[unit]->serial_number = component_label->serial_number;
947 /* current version number */
948 ci_label.version = RF_COMPONENT_LABEL_VERSION;
949 ci_label.serial_number = component_label->serial_number;
950 ci_label.mod_counter = raidPtrs[unit]->mod_counter;
951 ci_label.num_rows = raidPtrs[unit]->numRow;
952 ci_label.num_columns = raidPtrs[unit]->numCol;
953 ci_label.clean = RF_RAID_DIRTY; /* not clean */
954 ci_label.status = rf_ds_optimal; /* "It's good!" */
955
956 for(row=0;row<raidPtrs[unit]->numRow;row++) {
957 ci_label.row = row;
958 for(column=0;column<raidPtrs[unit]->numCol;column++) {
959 ci_label.column = column;
960 raidwrite_component_label(
961 raidPtrs[unit]->Disks[row][column].dev,
962 raidPtrs[unit]->raid_cinfo[row][column].ci_vp,
963 &ci_label );
964 }
965 }
966
967 return (retcode);
968
969 /* initialize all parity */
970 case RAIDFRAME_REWRITEPARITY:
971
972 if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) {
973 /* Parity for RAID 0 is trivially correct */
974 raidPtrs[unit]->parity_good = RF_RAID_CLEAN;
975 return(0);
976 }
977
978 /* borrow the thread of the requesting process */
979
980 s = splbio();
981 retcode = rf_RewriteParity(raidPtrs[unit]);
982 splx(s);
983 /* return I/O Error if the parity rewrite fails */
984
985 if (retcode) {
986 retcode = EIO;
987 } else {
988 /* set the clean bit! If we shutdown correctly,
989 the clean bit on each component label will get
990 set */
991 raidPtrs[unit]->parity_good = RF_RAID_CLEAN;
992 }
993 return (retcode);
994
995
996 case RAIDFRAME_ADD_HOT_SPARE:
997 sparePtr = (RF_SingleComponent_t *) data;
998 memcpy( &hot_spare, sparePtr, sizeof(RF_SingleComponent_t));
999 printf("Adding spare\n");
1000 retcode = rf_add_hot_spare(raidPtrs[unit], &hot_spare);
1001 return(retcode);
1002
1003 case RAIDFRAME_REMOVE_HOT_SPARE:
1004 return(retcode);
1005
1006 case RAIDFRAME_REBUILD_IN_PLACE:
1007
1008 if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) {
1009 /* Can't do this on a RAID 0!! */
1010 return(EINVAL);
1011 }
1012
1013 componentPtr = (RF_SingleComponent_t *) data;
1014 memcpy( &component, componentPtr,
1015 sizeof(RF_SingleComponent_t));
1016 row = component.row;
1017 column = component.column;
1018 printf("Rebuild: %d %d\n",row, column);
1019 if ((row < 0) || (row >= raidPtrs[unit]->numRow) ||
1020 (column < 0) || (column >= raidPtrs[unit]->numCol)) {
1021 return(EINVAL);
1022 }
1023 printf("Attempting a rebuild in place\n");
1024 s = splbio();
1025 retcode = rf_ReconstructInPlace(raidPtrs[unit], row, column);
1026 splx(s);
1027 return(retcode);
1028
1029 case RAIDFRAME_GET_INFO:
1030 {
1031 RF_Raid_t *raid = raidPtrs[unit];
1032 RF_DeviceConfig_t *cfg, **ucfgp;
1033 int i, j, d;
1034
1035 if (!raid->valid)
1036 return (ENODEV);
1037 ucfgp = (RF_DeviceConfig_t **) data;
1038 RF_Malloc(cfg, sizeof(RF_DeviceConfig_t),
1039 (RF_DeviceConfig_t *));
1040 if (cfg == NULL)
1041 return (ENOMEM);
1042 bzero((char *) cfg, sizeof(RF_DeviceConfig_t));
1043 cfg->rows = raid->numRow;
1044 cfg->cols = raid->numCol;
1045 cfg->ndevs = raid->numRow * raid->numCol;
1046 if (cfg->ndevs >= RF_MAX_DISKS) {
1047 cfg->ndevs = 0;
1048 return (ENOMEM);
1049 }
1050 cfg->nspares = raid->numSpare;
1051 if (cfg->nspares >= RF_MAX_DISKS) {
1052 cfg->nspares = 0;
1053 return (ENOMEM);
1054 }
1055 cfg->maxqdepth = raid->maxQueueDepth;
1056 d = 0;
1057 for (i = 0; i < cfg->rows; i++) {
1058 for (j = 0; j < cfg->cols; j++) {
1059 cfg->devs[d] = raid->Disks[i][j];
1060 d++;
1061 }
1062 }
1063 for (j = cfg->cols, i = 0; i < cfg->nspares; i++, j++) {
1064 cfg->spares[i] = raid->Disks[0][j];
1065 }
1066 retcode = copyout((caddr_t) cfg, (caddr_t) * ucfgp,
1067 sizeof(RF_DeviceConfig_t));
1068 RF_Free(cfg, sizeof(RF_DeviceConfig_t));
1069
1070 return (retcode);
1071 }
1072 break;
1073 case RAIDFRAME_CHECK_PARITY:
1074 *(int *) data = raidPtrs[unit]->parity_good;
1075 return (0);
1076 case RAIDFRAME_RESET_ACCTOTALS:
1077 {
1078 RF_Raid_t *raid = raidPtrs[unit];
1079
1080 bzero(&raid->acc_totals, sizeof(raid->acc_totals));
1081 return (0);
1082 }
1083 break;
1084
1085 case RAIDFRAME_GET_ACCTOTALS:
1086 {
1087 RF_AccTotals_t *totals = (RF_AccTotals_t *) data;
1088 RF_Raid_t *raid = raidPtrs[unit];
1089
1090 *totals = raid->acc_totals;
1091 return (0);
1092 }
1093 break;
1094
1095 case RAIDFRAME_KEEP_ACCTOTALS:
1096 {
1097 RF_Raid_t *raid = raidPtrs[unit];
1098 int *keep = (int *) data;
1099
1100 raid->keep_acc_totals = *keep;
1101 return (0);
1102 }
1103 break;
1104
1105 case RAIDFRAME_GET_SIZE:
1106 *(int *) data = raidPtrs[unit]->totalSectors;
1107 return (0);
1108
1109 #define RAIDFRAME_RECON 1
1110 /* XXX The above should probably be set somewhere else!! GO */
1111 #if RAIDFRAME_RECON > 0
1112
1113 /* fail a disk & optionally start reconstruction */
1114 case RAIDFRAME_FAIL_DISK:
1115
1116 if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) {
1117 /* Can't do this on a RAID 0!! */
1118 return(EINVAL);
1119 }
1120
1121 rr = (struct rf_recon_req *) data;
1122
1123 if (rr->row < 0 || rr->row >= raidPtrs[unit]->numRow
1124 || rr->col < 0 || rr->col >= raidPtrs[unit]->numCol)
1125 return (EINVAL);
1126
1127 printf("raid%d: Failing the disk: row: %d col: %d\n",
1128 unit, rr->row, rr->col);
1129
1130 /* make a copy of the recon request so that we don't rely on
1131 * the user's buffer */
1132 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1133 bcopy(rr, rrcopy, sizeof(*rr));
1134 rrcopy->raidPtr = (void *) raidPtrs[unit];
1135
1136 LOCK_RECON_Q_MUTEX();
1137 rrcopy->next = recon_queue;
1138 recon_queue = rrcopy;
1139 wakeup(&recon_queue);
1140 UNLOCK_RECON_Q_MUTEX();
1141
1142 return (0);
1143
1144 /* invoke a copyback operation after recon on whatever disk
1145 * needs it, if any */
1146 case RAIDFRAME_COPYBACK:
1147
1148 if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) {
1149 /* This makes no sense on a RAID 0!! */
1150 return(EINVAL);
1151 }
1152
1153 /* borrow the current thread to get this done */
1154
1155 s = splbio();
1156 rf_CopybackReconstructedData(raidPtrs[unit]);
1157 splx(s);
1158 return (0);
1159
1160 /* return the percentage completion of reconstruction */
1161 case RAIDFRAME_CHECKRECON:
1162 if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) {
1163 /* This makes no sense on a RAID 0 */
1164 return(EINVAL);
1165 }
1166
1167 row = *(int *) data;
1168 if (row < 0 || row >= raidPtrs[unit]->numRow)
1169 return (EINVAL);
1170 if (raidPtrs[unit]->status[row] != rf_rs_reconstructing)
1171 *(int *) data = 100;
1172 else
1173 *(int *) data = raidPtrs[unit]->reconControl[row]->percentComplete;
1174 return (0);
1175
1176 /* the sparetable daemon calls this to wait for the kernel to
1177 * need a spare table. this ioctl does not return until a
1178 * spare table is needed. XXX -- calling mpsleep here in the
1179 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1180 * -- I should either compute the spare table in the kernel,
1181 * or have a different -- XXX XXX -- interface (a different
1182 * character device) for delivering the table -- XXX */
1183 #if 0
1184 case RAIDFRAME_SPARET_WAIT:
1185 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1186 while (!rf_sparet_wait_queue)
1187 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1188 waitreq = rf_sparet_wait_queue;
1189 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1190 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1191
1192 *((RF_SparetWait_t *) data) = *waitreq; /* structure assignment */
1193
1194 RF_Free(waitreq, sizeof(*waitreq));
1195 return (0);
1196
1197
1198 /* wakes up a process waiting on SPARET_WAIT and puts an error
1199 * code in it that will cause the dameon to exit */
1200 case RAIDFRAME_ABORT_SPARET_WAIT:
1201 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1202 waitreq->fcol = -1;
1203 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1204 waitreq->next = rf_sparet_wait_queue;
1205 rf_sparet_wait_queue = waitreq;
1206 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1207 wakeup(&rf_sparet_wait_queue);
1208 return (0);
1209
1210 /* used by the spare table daemon to deliver a spare table
1211 * into the kernel */
1212 case RAIDFRAME_SEND_SPARET:
1213
1214 /* install the spare table */
1215 retcode = rf_SetSpareTable(raidPtrs[unit], *(void **) data);
1216
1217 /* respond to the requestor. the return status of the spare
1218 * table installation is passed in the "fcol" field */
1219 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1220 waitreq->fcol = retcode;
1221 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1222 waitreq->next = rf_sparet_resp_queue;
1223 rf_sparet_resp_queue = waitreq;
1224 wakeup(&rf_sparet_resp_queue);
1225 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1226
1227 return (retcode);
1228 #endif
1229
1230
1231 #endif /* RAIDFRAME_RECON > 0 */
1232
1233 default:
1234 break; /* fall through to the os-specific code below */
1235
1236 }
1237
1238 if (!raidPtrs[unit]->valid)
1239 return (EINVAL);
1240
1241 /*
1242 * Add support for "regular" device ioctls here.
1243 */
1244
1245 switch (cmd) {
1246 case DIOCGDINFO:
1247 db1_printf(("DIOCGDINFO %d %d\n", (int) dev, (int) DISKPART(dev)));
1248 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1249 break;
1250
1251 case DIOCGPART:
1252 db1_printf(("DIOCGPART: %d %d\n", (int) dev, (int) DISKPART(dev)));
1253 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1254 ((struct partinfo *) data)->part =
1255 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1256 break;
1257
1258 case DIOCWDINFO:
1259 db1_printf(("DIOCWDINFO\n"));
1260 case DIOCSDINFO:
1261 db1_printf(("DIOCSDINFO\n"));
1262 if ((error = raidlock(rs)) != 0)
1263 return (error);
1264
1265 rs->sc_flags |= RAIDF_LABELLING;
1266
1267 error = setdisklabel(rs->sc_dkdev.dk_label,
1268 (struct disklabel *) data, 0, rs->sc_dkdev.dk_cpulabel);
1269 if (error == 0) {
1270 if (cmd == DIOCWDINFO)
1271 error = writedisklabel(RAIDLABELDEV(dev),
1272 raidstrategy, rs->sc_dkdev.dk_label,
1273 rs->sc_dkdev.dk_cpulabel);
1274 }
1275 rs->sc_flags &= ~RAIDF_LABELLING;
1276
1277 raidunlock(rs);
1278
1279 if (error)
1280 return (error);
1281 break;
1282
1283 case DIOCWLABEL:
1284 db1_printf(("DIOCWLABEL\n"));
1285 if (*(int *) data != 0)
1286 rs->sc_flags |= RAIDF_WLABEL;
1287 else
1288 rs->sc_flags &= ~RAIDF_WLABEL;
1289 break;
1290
1291 case DIOCGDEFLABEL:
1292 db1_printf(("DIOCGDEFLABEL\n"));
1293 raidgetdefaultlabel(raidPtrs[unit], rs,
1294 (struct disklabel *) data);
1295 break;
1296
1297 default:
1298 retcode = ENOTTY; /* XXXX ?? OR EINVAL ? */
1299 }
1300 return (retcode);
1301
1302 }
1303
1304
1305 /* raidinit -- complete the rest of the initialization for the
1306 RAIDframe device. */
1307
1308
1309 static int
1310 raidinit(dev, raidPtr, unit)
1311 dev_t dev;
1312 RF_Raid_t *raidPtr;
1313 int unit;
1314 {
1315 int retcode;
1316 /* int ix; */
1317 /* struct raidbuf *raidbp; */
1318 struct raid_softc *rs;
1319
1320 retcode = 0;
1321
1322 rs = &raid_softc[unit];
1323 pool_init(&rs->sc_cbufpool, sizeof(struct raidbuf), 0,
1324 0, 0, "raidpl", 0, NULL, NULL, M_RAIDFRAME);
1325
1326
1327 /* XXX should check return code first... */
1328 rs->sc_flags |= RAIDF_INITED;
1329
1330 sprintf(rs->sc_xname, "raid%d", unit); /* XXX doesn't check bounds. */
1331
1332 rs->sc_dkdev.dk_name = rs->sc_xname;
1333
1334 /* disk_attach actually creates space for the CPU disklabel, among
1335 * other things, so it's critical to call this *BEFORE* we try putzing
1336 * with disklabels. */
1337
1338 disk_attach(&rs->sc_dkdev);
1339
1340 /* XXX There may be a weird interaction here between this, and
1341 * protectedSectors, as used in RAIDframe. */
1342
1343 rs->sc_size = raidPtr->totalSectors;
1344 rs->sc_dev = dev;
1345
1346 return (retcode);
1347 }
1348
1349 /*
1350 * This kernel thread never exits. It is created once, and persists
1351 * until the system reboots.
1352 */
1353
void
rf_ReconKernelThread()
{
	struct rf_recon_req *req;
	int s;

	/* XXX not sure what spl() level we should be at here... probably
	 * splbio() */
	/* NOTE(review): `s' is never restored with splx(); this thread
	 * apparently stays at splbio() for its whole life -- confirm
	 * that is intentional. */
	s = splbio();

	while (1) {
		/* grab the next reconstruction request from the queue */
		LOCK_RECON_Q_MUTEX();
		while (!recon_queue) {
			/* Drop the queue lock before sleeping so the
			 * enqueuer (the RAIDFRAME_FAIL_DISK ioctl) can
			 * make progress; re-take it before re-testing. */
			UNLOCK_RECON_Q_MUTEX();
			tsleep(&recon_queue, PRIBIO,
			       "raidframe recon", 0);
			LOCK_RECON_Q_MUTEX();
		}
		/* Pop the head of the singly-linked request list. */
		req = recon_queue;
		recon_queue = recon_queue->next;
		UNLOCK_RECON_Q_MUTEX();

		/*
		 * If flags specifies that we should start recon, this call
		 * will not return until reconstruction completes, fails,
		 * or is aborted.
		 */
		rf_FailDisk((RF_Raid_t *) req->raidPtr, req->row, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

		/* The request was heap-copied by the ioctl code, so we
		 * own it and must free it. */
		RF_Free(req, sizeof(*req));
	}
}
1388 /* wake up the daemon & tell it to get us a spare table
1389 * XXX
1390 * the entries in the queues should be tagged with the raidPtr
1391 * so that in the extremely rare case that two recons happen at once,
1392 * we know for which device were requesting a spare table
1393 * XXX
1394 */
int
rf_GetSpareTableFromDaemon(req)
	RF_SparetWait_t *req;
{
	int retcode;

	/* Post our request on the wait queue and poke the daemon
	 * sleeping in RAIDFRAME_SPARET_WAIT. */
	RF_LOCK_MUTEX(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	wakeup(&rf_sparet_wait_queue);

	/* NOTE(review): an older comment claimed "mpsleep unlocks the
	 * mutex", but tsleep() does not release rf_sparet_wait_mutex --
	 * as written we appear to sleep while still holding it.  Confirm
	 * whether RF_LOCK_MUTEX is a real sleep lock in this config. */
	while (!rf_sparet_resp_queue) {
		tsleep(&rf_sparet_resp_queue, PRIBIO,
		       "raidframe getsparetable", 0);
	}
	/* Pop the daemon's response off the response queue.  From here
	 * on `req' is the response, not our original request. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);

	/* The spare-table installation status is passed back in fcol. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1420 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1421 * bp & passes it down.
1422 * any calls originating in the kernel must use non-blocking I/O
1423 * do some extra sanity checking to return "appropriate" error values for
1424 * certain conditions (to make some standard utilities work)
1425 */
int
rf_DoAccessKernel(raidPtr, bp, flags, cbFunc, cbArg)
	RF_Raid_t *raidPtr;
	struct buf *bp;
	RF_RaidAccessFlags_t flags;
	void (*cbFunc) (struct buf *);
	void *cbArg;
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	int retcode;
	struct partition *pp;
	daddr_t blocknum;
	int unit;
	struct raid_softc *rs;
	int do_async;

	/* XXX The dev_t used here should be for /dev/[r]raid* !!! */

	unit = raidPtr->raidid;
	rs = &raid_softc[unit];

	/* Ok, for the bp we have here, bp->b_blkno is relative to the
	 * partition.. Need to make it absolute to the underlying device.. */

	blocknum = bp->b_blkno;
	if (DISKPART(bp->b_dev) != RAW_PART) {
		pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
		blocknum += pp->p_offset;
		db1_printf(("updated: %d %d\n", DISKPART(bp->b_dev),
			pp->p_offset));
	} else {
		db1_printf(("Is raw..\n"));
	}
	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno, (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to... but DO NOT
	 * TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* Whole sectors in the request, plus one more (pb) if there is a
	 * partial-sector tail. */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" forces this branch on unconditionally;
	 * db1_printf is presumably compiled out in normal builds. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			(int) raid_addr, (int) sum, (int) num_blocks,
			(int) pb, (int) bp->b_resid));
	}
	/* Reject requests that run off the end of the array; the "sum <"
	 * comparisons also catch arithmetic wrap-around. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		bp->b_error = ENOSPC;
		bp->b_flags |= B_ERROR;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		return (bp->b_error);
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Reject transfers that are not a multiple of the sector size. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		return (bp->b_error);
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Put a throttle on the number of requests we handle simultanously */

	RF_LOCK_MUTEX(raidPtr->mutex);

	/* Sleep until an "opening" is available; presumably the I/O
	 * completion path does a wakeup on &raidPtr->openings -- that
	 * code is not visible here. */
	while(raidPtr->openings <= 0) {
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		(void)tsleep(&raidPtr->openings, PRIBIO, "rfdwait", 0);
		RF_LOCK_MUTEX(raidPtr->mutex);
	}
	raidPtr->openings--;

	RF_UNLOCK_MUTEX(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE. always condition on
	 * B_READ instead */
	retcode = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
	    RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
	    do_async, raid_addr, num_blocks,
	    bp->b_un.b_addr,
	    bp, NULL, NULL, RF_DAG_NONBLOCKING_IO | flags,
	    NULL, cbFunc, cbArg);
#if 0
	db1_printf(("After call to DoAccess: 0x%x 0x%x %d\n", bp,
		bp->b_data, (int) bp->b_resid));
#endif

	return (retcode);
}
1532 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
1533
int
rf_DispatchKernelIO(queue, req)
	RF_DiskQueue_t *queue;
	RF_DiskQueueData_t *req;
{
	/* Map the RAIDframe I/O type onto the buf read/write flag. */
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;
	struct raidbuf *raidbp = NULL;
	struct raid_softc *rs;
	int unit;

	/* XXX along with the vnode, we also need the softc associated with
	 * this device.. */

	req->queue = queue;

	unit = queue->raidPtr->raidid;

	db1_printf(("DispatchKernelIO unit: %d\n", unit));

	if (unit >= numraid) {
		printf("Invalid unit number: %d %d\n", unit, numraid);
		panic("Invalid Unit number in rf_DispatchKernelIO\n");
	}
	rs = &raid_softc[unit];

	/* XXX is this the right place? */
	disk_busy(&rs->sc_dkdev);

	bp = req->bp;
#if 1
	/* XXX when there is a physical disk failure, someone is passing us a
	 * buffer that contains old stuff!! Attempt to deal with this problem
	 * without taking a performance hit... (not sure where the real bug
	 * is. It's buried in RAIDframe somewhere) :-( GO ) */

	if (bp->b_flags & B_ERROR) {
		bp->b_flags &= ~B_ERROR;
	}
	if (bp->b_error != 0) {
		bp->b_error = 0;
	}
#endif
	/* Shadow buf used for the actual component I/O; the original bp
	 * is stashed in rf_obp and recovered in KernelWakeupFunc(). */
	raidbp = RAIDGETBUF(rs);

	raidbp->rf_flags = 0;	/* XXX not really used anywhere... */

	/*
	 * context for raidiodone
	 */
	raidbp->rf_obp = bp;
	raidbp->req = req;

	LIST_INIT(&raidbp->rf_buf.b_dep);

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* Dprintf2("rf_DispatchKernelIO: NOP to r %d c %d\n",
		 * queue->row, queue->col); */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* XXX need to glue the original buffer into this?? */

		/* Complete the NOP immediately through the normal
		 * completion callback. */
		KernelWakeupFunc(&raidbp->rf_buf);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:

		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
		/* Fill in the shadow buf for the component device and
		 * arrange for KernelWakeupFunc() on completion. */
		InitBP(&raidbp->rf_buf, queue->rf_cinfo->ci_vp,
		    op | bp->b_flags, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;
		/* Dprintf3("rf_DispatchKernelIO: %c to row %d col %d\n",
		 * req->type, queue->row, queue->col); */

		db1_printf(("Going for %c to unit %d row %d col %d\n",
			req->type, unit, queue->row, queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));
		/* Writes must bump the vnode's output counter. */
		if ((raidbp->rf_buf.b_flags & B_READ) == 0) {
			raidbp->rf_buf.b_vp->v_numoutput++;
		}
		VOP_STRATEGY(&raidbp->rf_buf);

		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));
	return (0);
}
1648 /* this is the callback function associated with a I/O invoked from
1649 kernel code.
1650 */
static void
KernelWakeupFunc(vbp)
	struct buf *vbp;
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;
	/* vbp is really the shadow raidbuf allocated in
	 * rf_DispatchKernelIO(); its rf_buf is the first member. */
	struct raidbuf *raidbp = (struct raidbuf *) vbp;
	struct buf *bp;
	struct raid_softc *rs;
	int unit;
	register int s;

	s = splbio();		/* XXX */
	db1_printf(("recovering the request queue:\n"));
	req = raidbp->req;

	/* Recover the original buffer stashed at dispatch time. */
	bp = raidbp->rf_obp;
#if 0
	db1_printf(("bp=0x%x\n", bp));
#endif

	queue = (RF_DiskQueue_t *) req->queue;

	/* Propagate a component I/O error to the original buffer,
	 * defaulting to EIO if no specific errno was recorded. */
	if (raidbp->rf_buf.b_flags & B_ERROR) {
#if 0
		printf("Setting bp->b_flags!!! %d\n", raidbp->rf_buf.b_error);
#endif
		bp->b_flags |= B_ERROR;
		bp->b_error = raidbp->rf_buf.b_error ?
		    raidbp->rf_buf.b_error : EIO;
	}
#if 0
	db1_printf(("raidbp->rf_buf.b_bcount=%d\n", (int) raidbp->rf_buf.b_bcount));
	db1_printf(("raidbp->rf_buf.b_bufsize=%d\n", (int) raidbp->rf_buf.b_bufsize));
	db1_printf(("raidbp->rf_buf.b_resid=%d\n", (int) raidbp->rf_buf.b_resid));
	db1_printf(("raidbp->rf_buf.b_data=0x%x\n", raidbp->rf_buf.b_data));
#endif

	/* XXX methinks this could be wrong... */
#if 1
	bp->b_resid = raidbp->rf_buf.b_resid;
#endif

	/* Account this physical I/O in the trace record, if any. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		RF_LOCK_MUTEX(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		RF_UNLOCK_MUTEX(rf_tracing_mutex);
	}
	bp->b_bcount = raidbp->rf_buf.b_bcount;	/* XXXX ?? */

	unit = queue->raidPtr->raidid;	/* *Much* simpler :-> */


	/* XXX Ok, let's get aggressive... If B_ERROR is set, let's go
	 * ballistic, and mark the component as hosed... */
#if 1
	if (bp->b_flags & B_ERROR) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		if (queue->raidPtr->Disks[queue->row][queue->col].status ==
		    rf_ds_optimal) {
			printf("raid%d: IO Error. Marking %s as failed.\n",
			    unit, queue->raidPtr->Disks[queue->row][queue->col].devname);
			queue->raidPtr->Disks[queue->row][queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status[queue->row] = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* XXX here we should bump the version number for each component, and write that data out */
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}
#endif

	/* Return the shadow buf to the per-unit pool. */
	rs = &raid_softc[unit];
	RAIDPUTBUF(rs, raidbp);


	if (bp->b_resid == 0) {
		db1_printf(("Disk is no longer busy for this buffer... %d %ld %ld\n",
			unit, bp->b_resid, bp->b_bcount));
		/* XXX is this the right place for a disk_unbusy()??!??!?!? */
		disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid));
	} else {
		db1_printf(("b_resid is still %ld\n", bp->b_resid));
	}

	/* Tell the disk queue and RAIDframe that this request is done. */
	rf_DiskIOComplete(queue, req, (bp->b_flags & B_ERROR) ? 1 : 0);
	(req->CompleteFunc) (req->argument, (bp->b_flags & B_ERROR) ? 1 : 0);
	/* printf("Exiting KernelWakeupFunc\n"); */

	splx(s);		/* XXX */
}
1749
1750
1751
1752 /*
1753 * initialize a buf structure for doing an I/O in the kernel.
1754 */
static void
InitBP(
    struct buf * bp,
    struct vnode * b_vp,
    unsigned rw_flag,
    dev_t dev,
    RF_SectorNum_t startSect,
    RF_SectorCount_t numSect,
    caddr_t buf,
    void (*cbFunc) (struct buf *),
    void *cbArg,
    int logBytesPerSector,
    struct proc * b_proc)
{
	/* bp->b_flags = B_PHYS | rw_flag; */
	/* B_CALL requests that b_iodone (wired to cbFunc below) be run
	 * when the I/O completes. */
	bp->b_flags = B_CALL | rw_flag;	/* XXX need B_PHYS here too??? */
	/* Transfer length: sectors converted to bytes. */
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	db1_printf(("bp->b_dev is %d\n", dev));
	bp->b_un.b_addr = buf;
#if 0
	db1_printf(("bp->b_data=0x%x\n", bp->b_data));
#endif
	bp->b_blkno = startSect;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	db1_printf(("b_bcount is: %d\n", (int) bp->b_bcount));
	/* A zero-length transfer indicates a caller bug. */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!\n");
	}
	bp->b_proc = b_proc;
	bp->b_iodone = cbFunc;
	bp->b_vp = b_vp;

	/* NOTE(review): cbArg is accepted but never stored -- the
	 * completion context apparently travels via the buf itself. */
}
1791
1792 static void
1793 raidgetdefaultlabel(raidPtr, rs, lp)
1794 RF_Raid_t *raidPtr;
1795 struct raid_softc *rs;
1796 struct disklabel *lp;
1797 {
1798 db1_printf(("Building a default label...\n"));
1799 bzero(lp, sizeof(*lp));
1800
1801 /* fabricate a label... */
1802 lp->d_secperunit = raidPtr->totalSectors;
1803 lp->d_secsize = raidPtr->bytesPerSector;
1804 lp->d_nsectors = 1024 * (1024 / raidPtr->bytesPerSector);
1805 lp->d_ntracks = 1;
1806 lp->d_ncylinders = raidPtr->totalSectors / lp->d_nsectors;
1807 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1808
1809 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
1810 lp->d_type = DTYPE_RAID;
1811 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1812 lp->d_rpm = 3600;
1813 lp->d_interleave = 1;
1814 lp->d_flags = 0;
1815
1816 lp->d_partitions[RAW_PART].p_offset = 0;
1817 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
1818 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1819 lp->d_npartitions = RAW_PART + 1;
1820
1821 lp->d_magic = DISKMAGIC;
1822 lp->d_magic2 = DISKMAGIC;
1823 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
1824
1825 }
1826 /*
1827 * Read the disklabel from the raid device. If one is not present, fake one
1828 * up.
1829 */
static void
raidgetdisklabel(dev)
	dev_t dev;
{
	int unit = raidunit(dev);
	struct raid_softc *rs = &raid_softc[unit];
	char *errstring;
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
	RF_Raid_t *raidPtr;

	db1_printf(("Getting the disklabel...\n"));

	bzero(clp, sizeof(*clp));

	raidPtr = raidPtrs[unit];

	/* Start from a fabricated default label... */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		/* ...no label found on disk: synthesize one. */
		raidmakedisklabel(rs);
	else {
		int i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since the total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * the same components are used, and an old disklabel may be
		 * used if one is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("WARNING: %s: "
			    "total sector size in disklabel (%d) != "
			    "the size of raid (%ld)\n", rs->sc_xname,
			    lp->d_secperunit, (long) rs->sc_size);
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("WARNING: %s: end of partition `%c' "
				    "exceeds the size of raid (%ld)\n",
				    rs->sc_xname, 'a' + i, (long) rs->sc_size);
		}
	}

}
1883 /*
1884 * Take care of things one might want to take care of in the event
1885 * that a disklabel isn't present.
1886 */
1887 static void
1888 raidmakedisklabel(rs)
1889 struct raid_softc *rs;
1890 {
1891 struct disklabel *lp = rs->sc_dkdev.dk_label;
1892 db1_printf(("Making a label..\n"));
1893
1894 /*
1895 * For historical reasons, if there's no disklabel present
1896 * the raw partition must be marked FS_BSDFFS.
1897 */
1898
1899 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1900
1901 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1902
1903 lp->d_checksum = dkcksum(lp);
1904 }
1905 /*
1906 * Lookup the provided name in the filesystem. If the file exists,
1907 * is a valid block device, and isn't being used by anyone else,
1908 * set *vpp to the file's vnode.
1909 * You'll find the original of this in ccd.c
1910 */
1911 int
1912 raidlookup(path, p, vpp)
1913 char *path;
1914 struct proc *p;
1915 struct vnode **vpp; /* result */
1916 {
1917 struct nameidata nd;
1918 struct vnode *vp;
1919 struct vattr va;
1920 int error;
1921
1922 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, p);
1923 if ((error = vn_open(&nd, FREAD | FWRITE, 0)) != 0) {
1924 #ifdef DEBUG
1925 printf("RAIDframe: vn_open returned %d\n", error);
1926 #endif
1927 return (error);
1928 }
1929 vp = nd.ni_vp;
1930 if (vp->v_usecount > 1) {
1931 VOP_UNLOCK(vp, 0);
1932 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
1933 return (EBUSY);
1934 }
1935 if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) != 0) {
1936 VOP_UNLOCK(vp, 0);
1937 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
1938 return (error);
1939 }
1940 /* XXX: eventually we should handle VREG, too. */
1941 if (va.va_type != VBLK) {
1942 VOP_UNLOCK(vp, 0);
1943 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
1944 return (ENOTBLK);
1945 }
1946 VOP_UNLOCK(vp, 0);
1947 *vpp = vp;
1948 return (0);
1949 }
1950 /*
1951 * Wait interruptibly for an exclusive lock.
1952 *
1953 * XXX
1954 * Several drivers do this; it should be abstracted and made MP-safe.
1955 * (Hmm... where have we seen this warning before :-> GO )
1956 */
1957 static int
1958 raidlock(rs)
1959 struct raid_softc *rs;
1960 {
1961 int error;
1962
1963 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
1964 rs->sc_flags |= RAIDF_WANTED;
1965 if ((error =
1966 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
1967 return (error);
1968 }
1969 rs->sc_flags |= RAIDF_LOCKED;
1970 return (0);
1971 }
1972 /*
1973 * Unlock and wake up any waiters.
1974 */
1975 static void
1976 raidunlock(rs)
1977 struct raid_softc *rs;
1978 {
1979
1980 rs->sc_flags &= ~RAIDF_LOCKED;
1981 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
1982 rs->sc_flags &= ~RAIDF_WANTED;
1983 wakeup(rs);
1984 }
1985 }
1986
1987
1988 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
1989 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
1990
1991 int
1992 raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter)
1993 {
1994 RF_ComponentLabel_t component_label;
1995 raidread_component_label(dev, b_vp, &component_label);
1996 component_label.mod_counter = mod_counter;
1997 component_label.clean = RF_RAID_CLEAN;
1998 raidwrite_component_label(dev, b_vp, &component_label);
1999 return(0);
2000 }
2001
2002
2003 int
2004 raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter)
2005 {
2006 RF_ComponentLabel_t component_label;
2007 raidread_component_label(dev, b_vp, &component_label);
2008 component_label.mod_counter = mod_counter;
2009 component_label.clean = RF_RAID_DIRTY;
2010 raidwrite_component_label(dev, b_vp, &component_label);
2011 return(0);
2012 }
2013
2014 /* ARGSUSED */
/*
 * Read the RAIDframe component label from `dev', a component block
 * device, at fixed byte offset RF_COMPONENT_INFO_OFFSET.  Returns 0
 * on success or the error from biowait(); on failure
 * *component_label is left unmodified.
 */
int
raidread_component_label(dev, b_vp, component_label)
	dev_t dev;
	struct vnode *b_vp;
	RF_ComponentLabel_t *component_label;
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	/* get a block of the appropriate size... */
	bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
	bp->b_bcount = RF_COMPONENT_INFO_SIZE;
	bp->b_flags = B_BUSY | B_READ;
	/* NOTE(review): b_resid is set in DEV_BSIZE units here while
	 * b_bcount is in bytes -- confirm which unit is intended. */
	bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;

	/* Issue the read directly through the block device's strategy
	 * routine and wait for it synchronously. */
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	error = biowait(bp);

	if (!error) {
		memcpy(component_label, bp->b_un.b_addr,
		    sizeof(RF_ComponentLabel_t));
#if 0
		printf("raidread_component_label: got component label:\n");
		printf("Version: %d\n",component_label->version);
		printf("Serial Number: %d\n",component_label->serial_number);
		printf("Mod counter: %d\n",component_label->mod_counter);
		printf("Row: %d\n", component_label->row);
		printf("Column: %d\n", component_label->column);
		printf("Num Rows: %d\n", component_label->num_rows);
		printf("Num Columns: %d\n", component_label->num_columns);
		printf("Clean: %d\n", component_label->clean);
		printf("Status: %d\n", component_label->status);
#endif
	} else {
		printf("Failed to read RAID component label!\n");
	}

	/* Invalidate and release the temporary buffer. */
	bp->b_flags = B_INVAL | B_AGE;
	brelse(bp);
	return(error);
}
2064 /* ARGSUSED */
/*
 * Write *component_label to the fixed component-label area on `dev'.
 * The rest of the RF_COMPONENT_INFO_SIZE area is zero-filled first.
 * Returns 0 on success or the error from biowait().
 */
int
raidwrite_component_label(dev, b_vp, component_label)
	dev_t dev;
	struct vnode *b_vp;
	RF_ComponentLabel_t *component_label;
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
	bp->b_bcount = RF_COMPONENT_INFO_SIZE;
	bp->b_flags = B_BUSY | B_WRITE;
	/* NOTE(review): b_resid is set in DEV_BSIZE units here while
	 * b_bcount is in bytes -- confirm which unit is intended. */
	bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;

	/* Zero the whole area so nothing beyond the label structure
	 * leaks to disk. */
	memset( bp->b_un.b_addr, 0, RF_COMPONENT_INFO_SIZE );

	memcpy( bp->b_un.b_addr, component_label, sizeof(RF_ComponentLabel_t));

	/* Synchronous write through the component's strategy routine. */
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
	error = biowait(bp);
	/* Invalidate and release the temporary buffer. */
	bp->b_flags = B_INVAL | B_AGE;
	brelse(bp);
	if (error) {
		printf("Failed to write RAID component info!\n");
	}

	return(error);
}
2098
/*
 * Advance the array's modification counter and mark the component
 * label of every non-failed component dirty with that new counter.
 * NOTE(review): presumably called when on-disk parity can no longer be
 * trusted (e.g. before writes begin) -- confirm against callers.
 */
void
rf_markalldirty( raidPtr )
	RF_Raid_t *raidPtr;
{
	RF_ComponentLabel_t c_label;	/* scratch label, reused per component */
	int r,c;

	/* New label epoch: every mark below carries this counter. */
	raidPtr->mod_counter++;
	for (r = 0; r < raidPtr->numRow; r++) {
		for (c = 0; c < raidPtr->numCol; c++) {
			/* Failed components are left untouched. */
			if (raidPtr->Disks[r][c].status != rf_ds_failed) {
				/* NOTE(review): return value is ignored; if
				   the read fails, c_label.status below is
				   whatever was left in the stack buffer. */
				raidread_component_label(
					raidPtr->Disks[r][c].dev,
					raidPtr->raid_cinfo[r][c].ci_vp,
					&c_label);
				if (c_label.status == rf_ds_spared) {
					/* XXX do something special...
					 but whatever you do, don't
					 try to access it!! */
				} else {
#if 0
				c_label.status =
					raidPtr->Disks[r][c].status;
				raidwrite_component_label(
					raidPtr->Disks[r][c].dev,
					raidPtr->raid_cinfo[r][c].ci_vp,
					&c_label);
#endif
				/* Stamp this component dirty with the new
				   mod_counter. */
				raidmarkdirty(
				       raidPtr->Disks[r][c].dev,
				       raidPtr->raid_cinfo[r][c].ci_vp,
				       raidPtr->mod_counter);
				}
			}
		}
	}
	/* printf("Component labels marked dirty.\n"); */
	/* NOTE(review): dead spare-handling code below is stale -- it uses
	   variables never declared in this function (sparecol, i, j, srow,
	   scol), reads loop index r after the loops, and calls raidmarkclean
	   with two arguments where the live code in
	   rf_update_component_labels passes three.  It will not compile if
	   re-enabled as-is. */
#if 0
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[r][sparecol].status == rf_ds_used_spare) {
			/*

			   XXX this is where we get fancy and map this spare
			   into it's correct spot in the array.

			 */
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			for(i=0;i<raidPtr->numRow;i++) {
				for(j=0;j<raidPtr->numCol;j++) {
					if ((raidPtr->Disks[i][j].spareRow ==
					     r) &&
					    (raidPtr->Disks[i][j].spareCol ==
					     sparecol)) {
						srow = r;
						scol = sparecol;
						break;
					}
				}
			}

			raidread_component_label(
				      raidPtr->Disks[r][sparecol].dev,
				      raidPtr->raid_cinfo[r][sparecol].ci_vp,
				      &c_label);
			/* make sure status is noted */
			c_label.version = RF_COMPONENT_LABEL_VERSION;
			c_label.mod_counter = raidPtr->mod_counter;
			c_label.serial_number = raidPtr->serial_number;
			c_label.row = srow;
			c_label.column = scol;
			c_label.num_rows = raidPtr->numRow;
			c_label.num_columns = raidPtr->numCol;
			c_label.clean = RF_RAID_DIRTY; /* changed in a bit*/
			c_label.status = rf_ds_optimal;
			raidwrite_component_label(
				      raidPtr->Disks[r][sparecol].dev,
				      raidPtr->raid_cinfo[r][sparecol].ci_vp,
				      &c_label);
			raidmarkclean( raidPtr->Disks[r][sparecol].dev,
				      raidPtr->raid_cinfo[r][sparecol].ci_vp);
		}
	}

#endif
}
2193
2194
/*
 * Refresh the on-disk component labels after a state change: bump the
 * modification counter, rewrite each optimal component's label, and --
 * if parity is known good -- mark the components clean.  In-use spares
 * additionally get a fully rebuilt label describing the (row, column)
 * slot they now stand in for.
 */
void
rf_update_component_labels( raidPtr )
	RF_Raid_t *raidPtr;
{
	RF_ComponentLabel_t c_label;	/* scratch label, reused per component */
	int sparecol;			/* index of a spare in row 0 */
	int r,c;
	int i,j;
	int srow, scol;			/* array slot this spare replaces */

	/* Defaults in case the mapping search below finds nothing. */
	srow = -1;
	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	/* New label epoch for everything written below. */
	raidPtr->mod_counter++;

	for (r = 0; r < raidPtr->numRow; r++) {
		for (c = 0; c < raidPtr->numCol; c++) {
			if (raidPtr->Disks[r][c].status == rf_ds_optimal) {
				/* NOTE(review): read return value ignored;
				   a failed read leaves c_label partly
				   undefined before the rewrite. */
				raidread_component_label(
					raidPtr->Disks[r][c].dev,
					raidPtr->raid_cinfo[r][c].ci_vp,
					&c_label);
				/* make sure status is noted */
				c_label.status = rf_ds_optimal;
				raidwrite_component_label(
					raidPtr->Disks[r][c].dev,
					raidPtr->raid_cinfo[r][c].ci_vp,
					&c_label);
				/* Only claim "clean" when parity is good. */
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(
					      raidPtr->Disks[r][c].dev,
					      raidPtr->raid_cinfo[r][c].ci_vp,
					      raidPtr->mod_counter);
				}
			}
			/* else we don't touch it.. */
#if 0
			else if (raidPtr->Disks[r][c].status !=
				 rf_ds_failed) {
				raidread_component_label(
					raidPtr->Disks[r][c].dev,
					raidPtr->raid_cinfo[r][c].ci_vp,
					&c_label);
				/* make sure status is noted */
				c_label.status =
					raidPtr->Disks[r][c].status;
				raidwrite_component_label(
					raidPtr->Disks[r][c].dev,
					raidPtr->raid_cinfo[r][c].ci_vp,
					&c_label);
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(
					      raidPtr->Disks[r][c].dev,
					      raidPtr->raid_cinfo[r][c].ci_vp,
					      raidPtr->mod_counter);
				}
			}
#endif
		}
	}

	/* Spares live past the normal columns of row 0. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[0][sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which (row, col) slot this spare replaced.
			   NOTE(review): spareRow is compared against 0 --
			   presumably spares are only tracked in row 0;
			   the break only exits the inner loop, so the outer
			   scan continues after a match (harmless if the
			   mapping is unique -- confirm). */
			for(i=0;i<raidPtr->numRow;i++) {
				for(j=0;j<raidPtr->numCol;j++) {
					if ((raidPtr->Disks[i][j].spareRow ==
					     0) &&
					    (raidPtr->Disks[i][j].spareCol ==
					     sparecol)) {
						srow = i;
						scol = j;
						break;
					}
				}
			}

			raidread_component_label(
				      raidPtr->Disks[0][sparecol].dev,
				      raidPtr->raid_cinfo[0][sparecol].ci_vp,
				      &c_label);
			/* make sure status is noted */
			/* Rebuild the label so it describes the slot the
			   spare now occupies. */
			c_label.version = RF_COMPONENT_LABEL_VERSION;
			c_label.mod_counter = raidPtr->mod_counter;
			c_label.serial_number = raidPtr->serial_number;
			c_label.row = srow;
			c_label.column = scol;
			c_label.num_rows = raidPtr->numRow;
			c_label.num_columns = raidPtr->numCol;
			c_label.clean = RF_RAID_DIRTY; /* changed in a bit*/
			c_label.status = rf_ds_optimal;
			raidwrite_component_label(
				      raidPtr->Disks[0][sparecol].dev,
				      raidPtr->raid_cinfo[0][sparecol].ci_vp,
				      &c_label);
			if (raidPtr->parity_good == RF_RAID_CLEAN) {
				raidmarkclean( raidPtr->Disks[0][sparecol].dev,
				      raidPtr->raid_cinfo[0][sparecol].ci_vp,
				      raidPtr->mod_counter);
			}
		}
	}
	/* printf("Component labels updated\n"); */
}
2311