rf_netbsdkintf.c revision 1.29.8.2 1 /* $NetBSD: rf_netbsdkintf.c,v 1.29.8.2 1999/12/22 00:13:24 wrstuden Exp $ */
2 /*-
3 * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Greg Oster; Jason R. Thorpe.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the NetBSD
20 * Foundation, Inc. and its contributors.
21 * 4. Neither the name of The NetBSD Foundation nor the names of its
22 * contributors may be used to endorse or promote products derived
23 * from this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38 /*
39 * Copyright (c) 1988 University of Utah.
40 * Copyright (c) 1990, 1993
41 * The Regents of the University of California. All rights reserved.
42 *
43 * This code is derived from software contributed to Berkeley by
44 * the Systems Programming Group of the University of Utah Computer
45 * Science Department.
46 *
47 * Redistribution and use in source and binary forms, with or without
48 * modification, are permitted provided that the following conditions
49 * are met:
50 * 1. Redistributions of source code must retain the above copyright
51 * notice, this list of conditions and the following disclaimer.
52 * 2. Redistributions in binary form must reproduce the above copyright
53 * notice, this list of conditions and the following disclaimer in the
54 * documentation and/or other materials provided with the distribution.
55 * 3. All advertising materials mentioning features or use of this software
56 * must display the following acknowledgement:
57 * This product includes software developed by the University of
58 * California, Berkeley and its contributors.
59 * 4. Neither the name of the University nor the names of its contributors
60 * may be used to endorse or promote products derived from this software
61 * without specific prior written permission.
62 *
63 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73 * SUCH DAMAGE.
74 *
75 * from: Utah $Hdr: cd.c 1.6 90/11/28$
76 *
77 * @(#)cd.c 8.2 (Berkeley) 11/16/93
78 */
79
80
81
82
83 /*
84 * Copyright (c) 1995 Carnegie-Mellon University.
85 * All rights reserved.
86 *
87 * Authors: Mark Holland, Jim Zelenka
88 *
89 * Permission to use, copy, modify and distribute this software and
90 * its documentation is hereby granted, provided that both the copyright
91 * notice and this permission notice appear in all copies of the
92 * software, derivative works or modified versions, and any portions
93 * thereof, and that both notices appear in supporting documentation.
94 *
95 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
96 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
97 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
98 *
99 * Carnegie Mellon requests users of this software to return to
100 *
101 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
102 * School of Computer Science
103 * Carnegie Mellon University
104 * Pittsburgh PA 15213-3890
105 *
106 * any improvements or extensions that they make and grant Carnegie the
107 * rights to redistribute these changes.
108 */
109
110 /***********************************************************
111 *
112 * rf_kintf.c -- the kernel interface routines for RAIDframe
113 *
114 ***********************************************************/
115
116 #include <sys/errno.h>
117 #include <sys/param.h>
118 #include <sys/pool.h>
119 #include <sys/queue.h>
120 #include <sys/disk.h>
121 #include <sys/device.h>
122 #include <sys/stat.h>
123 #include <sys/ioctl.h>
124 #include <sys/fcntl.h>
125 #include <sys/systm.h>
126 #include <sys/namei.h>
127 #include <sys/vnode.h>
128 #include <sys/param.h>
129 #include <sys/types.h>
130 #include <machine/types.h>
131 #include <sys/disklabel.h>
132 #include <sys/conf.h>
133 #include <sys/lock.h>
134 #include <sys/buf.h>
135 #include <sys/user.h>
136
137 #include "raid.h"
138 #include "rf_raid.h"
139 #include "rf_raidframe.h"
140 #include "rf_dag.h"
141 #include "rf_dagflags.h"
142 #include "rf_diskqueue.h"
143 #include "rf_acctrace.h"
144 #include "rf_etimer.h"
145 #include "rf_general.h"
146 #include "rf_debugMem.h"
147 #include "rf_kintf.h"
148 #include "rf_options.h"
149 #include "rf_driver.h"
150 #include "rf_parityscan.h"
151 #include "rf_debugprint.h"
152 #include "rf_threadstuff.h"
153
154 int rf_kdebug_level = 0;
155
156 #define RFK_BOOT_NONE 0
157 #define RFK_BOOT_GOOD 1
158 #define RFK_BOOT_BAD 2
159 static int rf_kbooted = RFK_BOOT_NONE;
160
161 #ifdef DEBUG
162 #define db0_printf(a) printf a
163 #define db_printf(a) if (rf_kdebug_level > 0) printf a
164 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
165 #define db2_printf(a) if (rf_kdebug_level > 1) printf a
166 #define db3_printf(a) if (rf_kdebug_level > 2) printf a
167 #define db4_printf(a) if (rf_kdebug_level > 3) printf a
168 #define db5_printf(a) if (rf_kdebug_level > 4) printf a
169 #else /* DEBUG */
170 #define db0_printf(a) printf a
171 #define db1_printf(a) { }
172 #define db2_printf(a) { }
173 #define db3_printf(a) { }
174 #define db4_printf(a) { }
175 #define db5_printf(a) { }
176 #endif /* DEBUG */
177
178 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
179
180 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
181
182 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
183 * spare table */
184 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
185 * installation process */
186
187 static struct rf_recon_req *recon_queue = NULL; /* used to communicate
188 * reconstruction
189 * requests */
190
191
192 decl_simple_lock_data(, recon_queue_mutex)
193 #define LOCK_RECON_Q_MUTEX() simple_lock(&recon_queue_mutex)
194 #define UNLOCK_RECON_Q_MUTEX() simple_unlock(&recon_queue_mutex)
195
196 /* prototypes */
197 static void KernelWakeupFunc(struct buf * bp);
198 static void InitBP(struct buf * bp, struct vnode *, unsigned rw_flag,
199 dev_t dev, RF_SectorNum_t startSect,
200 RF_SectorCount_t numSect, caddr_t buf,
201 void (*cbFunc) (struct buf *), void *cbArg,
202 int logBytesPerSector, struct proc * b_proc);
203
204 #define Dprintf0(s) if (rf_queueDebug) \
205 rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
206 #define Dprintf1(s,a) if (rf_queueDebug) \
207 rf_debug_printf(s,a,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
208 #define Dprintf2(s,a,b) if (rf_queueDebug) \
209 rf_debug_printf(s,a,b,NULL,NULL,NULL,NULL,NULL,NULL)
210 #define Dprintf3(s,a,b,c) if (rf_queueDebug) \
211 rf_debug_printf(s,a,b,c,NULL,NULL,NULL,NULL,NULL)
212
213 int raidmarkclean(dev_t dev, struct vnode *b_vp, int);
214 int raidmarkdirty(dev_t dev, struct vnode *b_vp, int);
215
216 void raidattach __P((int));
217 int raidsize __P((dev_t));
218
219 void rf_DiskIOComplete(RF_DiskQueue_t *, RF_DiskQueueData_t *, int);
220 void rf_CopybackReconstructedData(RF_Raid_t * raidPtr);
221 static int raidinit __P((dev_t, RF_Raid_t *, int));
222
223 int raidopen __P((dev_t, int, int, struct proc *));
224 int raidclose __P((dev_t, int, int, struct proc *));
225 int raidioctl __P((dev_t, u_long, caddr_t, int, struct proc *));
226 int raidwrite __P((dev_t, struct uio *, int));
227 int raidread __P((dev_t, struct uio *, int));
228 void raidstrategy __P((struct buf *));
229 int raiddump __P((dev_t, daddr_t, caddr_t, size_t));
230
231 int raidwrite_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *);
232 int raidread_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *);
233 void rf_update_component_labels( RF_Raid_t *);
234 /*
235 * Pilfered from ccd.c
236 */
237
/*
 * Per-component I/O descriptor, modelled on ccd.c's cbuf.  One of these
 * wraps each physical I/O issued on behalf of a RAIDframe request; the
 * embedded buf is handed to the underlying disk driver.
 */
struct raidbuf {
	struct buf rf_buf;	/* new I/O buf.  MUST BE FIRST!!! (so a
				 * struct buf * from the disk driver can be
				 * cast back to a struct raidbuf *) */
	struct buf *rf_obp;	/* ptr. to original I/O buf */
	int rf_flags;		/* misc. flags */
	RF_DiskQueueData_t *req;/* the request that this was part of.. */
};
244
245
246 #define RAIDGETBUF(rs) pool_get(&(rs)->sc_cbufpool, PR_NOWAIT)
247 #define RAIDPUTBUF(rs, cbp) pool_put(&(rs)->sc_cbufpool, cbp)
248
249 /* XXX Not sure if the following should be replacing the raidPtrs above,
250 or if it should be used in conjunction with that... */
251
/*
 * Per-unit software state for a RAID device, one entry per configured
 * unit in the raid_softc[] array (indexed by raidunit(dev)).
 */
struct raid_softc {
	int sc_flags;		/* flags (RAIDF_* bits below) */
	int sc_cflags;		/* configuration flags */
	size_t sc_size;		/* size of the raid device */
	dev_t sc_dev;		/* our device.. */
	char sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct pool sc_cbufpool;	/* component buffer pool */
};
261 /* sc_flags */
262 #define RAIDF_INITED 0x01 /* unit has been initialized */
263 #define RAIDF_WLABEL 0x02 /* label area is writable */
264 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
265 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
266 #define RAIDF_LOCKED 0x80 /* unit is locked */
267
268 #define raidunit(x) DISKUNIT(x)
269 static int numraid = 0;
270
271 /*
272 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
273 * Be aware that large numbers can allow the driver to consume a lot of
274 * kernel memory, especially on writes, and in degraded mode reads.
275 *
276 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
277 * a single 64K write will typically require 64K for the old data,
278 * 64K for the old parity, and 64K for the new parity, for a total
279 * of 192K (if the parity buffer is not re-used immediately).
280 * Even it if is used immedately, that's still 128K, which when multiplied
281 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
282 *
283 * Now in degraded mode, for example, a 64K read on the above setup may
284 * require data reconstruction, which will require *all* of the 4 remaining
285 * disks to participate -- 4 * 32K/disk == 128K again.
286 */
287
288 #ifndef RAIDOUTSTANDING
289 #define RAIDOUTSTANDING 6
290 #endif
291
292 #define RAIDLABELDEV(dev) \
293 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
294
295 /* declared here, and made public, for the benefit of KVM stuff.. */
296 struct raid_softc *raid_softc;
297
298 static void raidgetdefaultlabel __P((RF_Raid_t *, struct raid_softc *,
299 struct disklabel *));
300 static void raidgetdisklabel __P((dev_t));
301 static void raidmakedisklabel __P((struct raid_softc *));
302
303 static int raidlock __P((struct raid_softc *));
304 static void raidunlock __P((struct raid_softc *));
305 int raidlookup __P((char *, struct proc * p, struct vnode **));
306
307 static void rf_markalldirty __P((RF_Raid_t *));
308
309 void
310 raidattach(num)
311 int num;
312 {
313 int raidID;
314 int i, rc;
315
316 #ifdef DEBUG
317 printf("raidattach: Asked for %d units\n", num);
318 #endif
319
320 if (num <= 0) {
321 #ifdef DIAGNOSTIC
322 panic("raidattach: count <= 0");
323 #endif
324 return;
325 }
326 /* This is where all the initialization stuff gets done. */
327
328 /* Make some space for requested number of units... */
329
330 RF_Calloc(raidPtrs, num, sizeof(RF_Raid_t *), (RF_Raid_t **));
331 if (raidPtrs == NULL) {
332 panic("raidPtrs is NULL!!\n");
333 }
334
335 rc = rf_mutex_init(&rf_sparet_wait_mutex);
336 if (rc) {
337 RF_PANIC();
338 }
339
340 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
341 recon_queue = NULL;
342
343 for (i = 0; i < numraid; i++)
344 raidPtrs[i] = NULL;
345 rc = rf_BootRaidframe();
346 if (rc == 0)
347 printf("Kernelized RAIDframe activated\n");
348 else
349 panic("Serious error booting RAID!!\n");
350
351 rf_kbooted = RFK_BOOT_GOOD;
352
353 /* put together some datastructures like the CCD device does.. This
354 * lets us lock the device and what-not when it gets opened. */
355
356 raid_softc = (struct raid_softc *)
357 malloc(num * sizeof(struct raid_softc),
358 M_RAIDFRAME, M_NOWAIT);
359 if (raid_softc == NULL) {
360 printf("WARNING: no memory for RAIDframe driver\n");
361 return;
362 }
363 numraid = num;
364 bzero(raid_softc, num * sizeof(struct raid_softc));
365
366 for (raidID = 0; raidID < num; raidID++) {
367 RF_Calloc(raidPtrs[raidID], 1, sizeof(RF_Raid_t),
368 (RF_Raid_t *));
369 if (raidPtrs[raidID] == NULL) {
370 printf("raidPtrs[%d] is NULL\n", raidID);
371 }
372 }
373 }
374
375
376 int
377 raidsize(dev)
378 dev_t dev;
379 {
380 struct raid_softc *rs;
381 struct disklabel *lp;
382 int part, unit, omask, size;
383
384 unit = raidunit(dev);
385 if (unit >= numraid)
386 return (-1);
387 rs = &raid_softc[unit];
388
389 if ((rs->sc_flags & RAIDF_INITED) == 0)
390 return (-1);
391
392 part = DISKPART(dev);
393 omask = rs->sc_dkdev.dk_openmask & (1 << part);
394 lp = rs->sc_dkdev.dk_label;
395
396 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curproc))
397 return (-1);
398
399 if (lp->d_partitions[part].p_fstype != FS_SWAP)
400 size = -1;
401 else
402 size = lp->d_partitions[part].p_size *
403 (lp->d_secsize / DEF_BSIZE);
404
405 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curproc))
406 return (-1);
407
408 return (size);
409
410 }
411
412 int
413 raiddump(dev, blkno, va, size)
414 dev_t dev;
415 daddr_t blkno;
416 caddr_t va;
417 size_t size;
418 {
419 /* Not implemented. */
420 return ENXIO;
421 }
422 /* ARGSUSED */
/*
 * raidopen:
 *	Open routine for the raid device.  Validates the unit and
 *	partition, reads the disklabel on first open, records the open in
 *	the char/block open masks, and marks all components dirty on the
 *	first open of a configured set.  The unit lock is held across the
 *	whole operation.  Returns 0 or an errno.
 */
int
raidopen(dev, flags, fmt, p)
	dev_t dev;
	int flags, fmt;
	struct proc *p;
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, pmask;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	/* Serialize against concurrent open/close/unconfigure. */
	if ((error = raidlock(rs)) != 0)
		return (error);
	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);
	pmask = (1 << part);

	db1_printf(("Opening raid device number: %d partition: %d\n",
	    unit, part));


	/* First open of a configured unit: (re-)read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		db1_printf(("Not a raw partition..\n"));
		/* NB: the INITED check short-circuits before lp is used,
		 * so a stale label on an unconfigured unit is never read. */
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			raidunlock(rs);
			db1_printf(("Bailing out...\n"));
			return (error);
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty( raidPtrs[unit] );
	}

	/* The combined mask is what the INITED/first-open tests above key on. */
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	raidunlock(rs);

	return (error);


}
501 /* ARGSUSED */
502 int
503 raidclose(dev, flags, fmt, p)
504 dev_t dev;
505 int flags, fmt;
506 struct proc *p;
507 {
508 int unit = raidunit(dev);
509 struct raid_softc *rs;
510 int error = 0;
511 int part;
512
513 if (unit >= numraid)
514 return (ENXIO);
515 rs = &raid_softc[unit];
516
517 if ((error = raidlock(rs)) != 0)
518 return (error);
519
520 part = DISKPART(dev);
521
522 /* ...that much closer to allowing unconfiguration... */
523 switch (fmt) {
524 case S_IFCHR:
525 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
526 break;
527
528 case S_IFBLK:
529 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
530 break;
531 }
532 rs->sc_dkdev.dk_openmask =
533 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
534
535 if ((rs->sc_dkdev.dk_openmask == 0) &&
536 ((rs->sc_flags & RAIDF_INITED) != 0)) {
537 /* Last one... device is not unconfigured yet.
538 Device shutdown has taken care of setting the
539 clean bits if RAIDF_INITED is not set
540 mark things as clean... */
541 rf_update_component_labels( raidPtrs[unit] );
542 }
543
544 raidunlock(rs);
545 return (0);
546
547 }
548
549 void
550 raidstrategy(bp)
551 register struct buf *bp;
552 {
553 register int s;
554
555 unsigned int raidID = raidunit(bp->b_dev);
556 RF_Raid_t *raidPtr;
557 struct raid_softc *rs = &raid_softc[raidID];
558 struct disklabel *lp;
559 int wlabel;
560
561 #if 0
562 db1_printf(("Strategy: 0x%x 0x%x\n", bp, bp->b_data));
563 db1_printf(("Strategy(2): bp->b_bufsize%d\n", (int) bp->b_bufsize));
564 db1_printf(("bp->b_count=%d\n", (int) bp->b_bcount));
565 db1_printf(("bp->b_resid=%d\n", (int) bp->b_resid));
566 db1_printf(("bp->b_blkno=%d\n", (int) bp->b_blkno));
567
568 if (bp->b_flags & B_READ)
569 db1_printf(("READ\n"));
570 else
571 db1_printf(("WRITE\n"));
572 #endif
573 if (rf_kbooted != RFK_BOOT_GOOD)
574 return;
575 if (raidID >= numraid || !raidPtrs[raidID]) {
576 bp->b_error = ENODEV;
577 bp->b_flags |= B_ERROR;
578 bp->b_resid = bp->b_bcount;
579 biodone(bp);
580 return;
581 }
582 raidPtr = raidPtrs[raidID];
583 if (!raidPtr->valid) {
584 bp->b_error = ENODEV;
585 bp->b_flags |= B_ERROR;
586 bp->b_resid = bp->b_bcount;
587 biodone(bp);
588 return;
589 }
590 if (bp->b_bcount == 0) {
591 db1_printf(("b_bcount is zero..\n"));
592 biodone(bp);
593 return;
594 }
595 lp = rs->sc_dkdev.dk_label;
596
597 /*
598 * Do bounds checking and adjust transfer. If there's an
599 * error, the bounds check will flag that for us.
600 */
601
602 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
603 if (DISKPART(bp->b_dev) != RAW_PART)
604 if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
605 db1_printf(("Bounds check failed!!:%d %d\n",
606 (int) bp->b_blkno, (int) wlabel));
607 biodone(bp);
608 return;
609 }
610 s = splbio(); /* XXX Needed? */
611 db1_printf(("Beginning strategy...\n"));
612
613 bp->b_resid = 0;
614 bp->b_error = rf_DoAccessKernel(raidPtrs[raidID], bp,
615 NULL, NULL, NULL);
616 if (bp->b_error) {
617 bp->b_flags |= B_ERROR;
618 db1_printf(("bp->b_flags HAS B_ERROR SET!!!: %d\n",
619 bp->b_error));
620 }
621 splx(s);
622 #if 0
623 db1_printf(("Strategy exiting: 0x%x 0x%x %d %d\n",
624 bp, bp->b_data,
625 (int) bp->b_bcount, (int) bp->b_resid));
626 #endif
627 }
628 /* ARGSUSED */
629 int
630 raidread(dev, uio, flags)
631 dev_t dev;
632 struct uio *uio;
633 int flags;
634 {
635 int unit = raidunit(dev);
636 struct raid_softc *rs;
637 int part;
638
639 if (unit >= numraid)
640 return (ENXIO);
641 rs = &raid_softc[unit];
642
643 if ((rs->sc_flags & RAIDF_INITED) == 0)
644 return (ENXIO);
645 part = DISKPART(dev);
646
647 db1_printf(("raidread: unit: %d partition: %d\n", unit, part));
648
649 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
650
651 }
652 /* ARGSUSED */
653 int
654 raidwrite(dev, uio, flags)
655 dev_t dev;
656 struct uio *uio;
657 int flags;
658 {
659 int unit = raidunit(dev);
660 struct raid_softc *rs;
661
662 if (unit >= numraid)
663 return (ENXIO);
664 rs = &raid_softc[unit];
665
666 if ((rs->sc_flags & RAIDF_INITED) == 0)
667 return (ENXIO);
668 db1_printf(("raidwrite\n"));
669 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
670
671 }
672
673 int
674 raidioctl(dev, cmd, data, flag, p)
675 dev_t dev;
676 u_long cmd;
677 caddr_t data;
678 int flag;
679 struct proc *p;
680 {
681 int unit = raidunit(dev);
682 int error = 0;
683 int part, pmask;
684 struct raid_softc *rs;
685 RF_Config_t *k_cfg, *u_cfg;
686 u_char *specific_buf;
687 int retcode = 0;
688 int row;
689 int column;
690 int s;
691 struct rf_recon_req *rrcopy, *rr;
692 RF_ComponentLabel_t *component_label;
693 RF_ComponentLabel_t ci_label;
694 RF_ComponentLabel_t **c_label_ptr;
695 RF_SingleComponent_t *sparePtr,*componentPtr;
696 RF_SingleComponent_t hot_spare;
697 RF_SingleComponent_t component;
698
699 if (unit >= numraid)
700 return (ENXIO);
701 rs = &raid_softc[unit];
702
703 db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
704 (int) DISKPART(dev), (int) unit, (int) cmd));
705
706 /* Must be open for writes for these commands... */
707 switch (cmd) {
708 case DIOCSDINFO:
709 case DIOCWDINFO:
710 case DIOCWLABEL:
711 if ((flag & FWRITE) == 0)
712 return (EBADF);
713 }
714
715 /* Must be initialized for these... */
716 switch (cmd) {
717 case DIOCGDINFO:
718 case DIOCSDINFO:
719 case DIOCWDINFO:
720 case DIOCGPART:
721 case DIOCWLABEL:
722 case DIOCGDEFLABEL:
723 case RAIDFRAME_SHUTDOWN:
724 case RAIDFRAME_REWRITEPARITY:
725 case RAIDFRAME_GET_INFO:
726 case RAIDFRAME_RESET_ACCTOTALS:
727 case RAIDFRAME_GET_ACCTOTALS:
728 case RAIDFRAME_KEEP_ACCTOTALS:
729 case RAIDFRAME_GET_SIZE:
730 case RAIDFRAME_FAIL_DISK:
731 case RAIDFRAME_COPYBACK:
732 case RAIDFRAME_CHECKRECON:
733 case RAIDFRAME_GET_COMPONENT_LABEL:
734 case RAIDFRAME_SET_COMPONENT_LABEL:
735 case RAIDFRAME_ADD_HOT_SPARE:
736 case RAIDFRAME_REMOVE_HOT_SPARE:
737 case RAIDFRAME_INIT_LABELS:
738 case RAIDFRAME_REBUILD_IN_PLACE:
739 case RAIDFRAME_CHECK_PARITY:
740 if ((rs->sc_flags & RAIDF_INITED) == 0)
741 return (ENXIO);
742 }
743
744 switch (cmd) {
745
746
747 /* configure the system */
748 case RAIDFRAME_CONFIGURE:
749
750 db3_printf(("rf_ioctl: RAIDFRAME_CONFIGURE\n"));
751 /* copy-in the configuration information */
752 /* data points to a pointer to the configuration structure */
753 u_cfg = *((RF_Config_t **) data);
754 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
755 if (k_cfg == NULL) {
756 db3_printf(("rf_ioctl: ENOMEM for config. Code is %d\n", retcode));
757 return (ENOMEM);
758 }
759 retcode = copyin((caddr_t) u_cfg, (caddr_t) k_cfg,
760 sizeof(RF_Config_t));
761 if (retcode) {
762 db3_printf(("rf_ioctl: retcode=%d copyin.1\n",
763 retcode));
764 return (retcode);
765 }
766 /* allocate a buffer for the layout-specific data, and copy it
767 * in */
768 if (k_cfg->layoutSpecificSize) {
769 if (k_cfg->layoutSpecificSize > 10000) {
770 /* sanity check */
771 db3_printf(("rf_ioctl: EINVAL %d\n", retcode));
772 return (EINVAL);
773 }
774 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
775 (u_char *));
776 if (specific_buf == NULL) {
777 RF_Free(k_cfg, sizeof(RF_Config_t));
778 db3_printf(("rf_ioctl: ENOMEM %d\n", retcode));
779 return (ENOMEM);
780 }
781 retcode = copyin(k_cfg->layoutSpecific,
782 (caddr_t) specific_buf,
783 k_cfg->layoutSpecificSize);
784 if (retcode) {
785 db3_printf(("rf_ioctl: retcode=%d copyin.2\n",
786 retcode));
787 return (retcode);
788 }
789 } else
790 specific_buf = NULL;
791 k_cfg->layoutSpecific = specific_buf;
792
793 /* should do some kind of sanity check on the configuration.
794 * Store the sum of all the bytes in the last byte? */
795
796 /* configure the system */
797
798 raidPtrs[unit]->raidid = unit;
799
800 retcode = rf_Configure(raidPtrs[unit], k_cfg);
801
802 /* allow this many simultaneous IO's to this RAID device */
803 raidPtrs[unit]->openings = RAIDOUTSTANDING;
804
805 if (retcode == 0) {
806 retcode = raidinit(dev, raidPtrs[unit], unit);
807 rf_markalldirty( raidPtrs[unit] );
808 }
809 /* free the buffers. No return code here. */
810 if (k_cfg->layoutSpecificSize) {
811 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
812 }
813 RF_Free(k_cfg, sizeof(RF_Config_t));
814
815 db3_printf(("rf_ioctl: retcode=%d RAIDFRAME_CONFIGURE\n",
816 retcode));
817
818 return (retcode);
819
820 /* shutdown the system */
821 case RAIDFRAME_SHUTDOWN:
822
823 if ((error = raidlock(rs)) != 0)
824 return (error);
825
826 /*
827 * If somebody has a partition mounted, we shouldn't
828 * shutdown.
829 */
830
831 part = DISKPART(dev);
832 pmask = (1 << part);
833 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
834 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
835 (rs->sc_dkdev.dk_copenmask & pmask))) {
836 raidunlock(rs);
837 return (EBUSY);
838 }
839
840 if (rf_debugKernelAccess) {
841 printf("call shutdown\n");
842 }
843
844 retcode = rf_Shutdown(raidPtrs[unit]);
845
846 db1_printf(("Done main shutdown\n"));
847
848 pool_destroy(&rs->sc_cbufpool);
849 db1_printf(("Done freeing component buffer freelist\n"));
850
851 /* It's no longer initialized... */
852 rs->sc_flags &= ~RAIDF_INITED;
853
854 /* Detach the disk. */
855 disk_detach(&rs->sc_dkdev);
856
857 raidunlock(rs);
858
859 return (retcode);
860 case RAIDFRAME_GET_COMPONENT_LABEL:
861 c_label_ptr = (RF_ComponentLabel_t **) data;
862 /* need to read the component label for the disk indicated
863 by row,column in component_label
864 XXX need to sanity check these values!!!
865 */
866
867 /* For practice, let's get it directly fromdisk, rather
868 than from the in-core copy */
869 RF_Malloc( component_label, sizeof( RF_ComponentLabel_t ),
870 (RF_ComponentLabel_t *));
871 if (component_label == NULL)
872 return (ENOMEM);
873
874 bzero((char *) component_label, sizeof(RF_ComponentLabel_t));
875
876 retcode = copyin( *c_label_ptr, component_label,
877 sizeof(RF_ComponentLabel_t));
878
879 if (retcode) {
880 return(retcode);
881 }
882
883 row = component_label->row;
884 column = component_label->column;
885
886 if ((row < 0) || (row >= raidPtrs[unit]->numRow) ||
887 (column < 0) || (column >= raidPtrs[unit]->numCol)) {
888 return(EINVAL);
889 }
890
891 raidread_component_label(
892 raidPtrs[unit]->Disks[row][column].dev,
893 raidPtrs[unit]->raid_cinfo[row][column].ci_vp,
894 component_label );
895
896 retcode = copyout((caddr_t) component_label,
897 (caddr_t) *c_label_ptr,
898 sizeof(RF_ComponentLabel_t));
899 RF_Free( component_label, sizeof(RF_ComponentLabel_t));
900 return (retcode);
901
902 case RAIDFRAME_SET_COMPONENT_LABEL:
903 component_label = (RF_ComponentLabel_t *) data;
904
905 /* XXX check the label for valid stuff... */
906 /* Note that some things *should not* get modified --
907 the user should be re-initing the labels instead of
908 trying to patch things.
909 */
910
911 printf("Got component label:\n");
912 printf("Version: %d\n",component_label->version);
913 printf("Serial Number: %d\n",component_label->serial_number);
914 printf("Mod counter: %d\n",component_label->mod_counter);
915 printf("Row: %d\n", component_label->row);
916 printf("Column: %d\n", component_label->column);
917 printf("Num Rows: %d\n", component_label->num_rows);
918 printf("Num Columns: %d\n", component_label->num_columns);
919 printf("Clean: %d\n", component_label->clean);
920 printf("Status: %d\n", component_label->status);
921
922 row = component_label->row;
923 column = component_label->column;
924
925 if ((row < 0) || (row >= raidPtrs[unit]->numRow) ||
926 (column < 0) || (column >= raidPtrs[unit]->numCol)) {
927 return(EINVAL);
928 }
929
930 /* XXX this isn't allowed to do anything for now :-) */
931 #if 0
932 raidwrite_component_label(
933 raidPtrs[unit]->Disks[row][column].dev,
934 raidPtrs[unit]->raid_cinfo[row][column].ci_vp,
935 component_label );
936 #endif
937 return (0);
938
939 case RAIDFRAME_INIT_LABELS:
940 component_label = (RF_ComponentLabel_t *) data;
941 /*
942 we only want the serial number from
943 the above. We get all the rest of the information
944 from the config that was used to create this RAID
945 set.
946 */
947
948 raidPtrs[unit]->serial_number = component_label->serial_number;
949 /* current version number */
950 ci_label.version = RF_COMPONENT_LABEL_VERSION;
951 ci_label.serial_number = component_label->serial_number;
952 ci_label.mod_counter = raidPtrs[unit]->mod_counter;
953 ci_label.num_rows = raidPtrs[unit]->numRow;
954 ci_label.num_columns = raidPtrs[unit]->numCol;
955 ci_label.clean = RF_RAID_DIRTY; /* not clean */
956 ci_label.status = rf_ds_optimal; /* "It's good!" */
957
958 for(row=0;row<raidPtrs[unit]->numRow;row++) {
959 ci_label.row = row;
960 for(column=0;column<raidPtrs[unit]->numCol;column++) {
961 ci_label.column = column;
962 raidwrite_component_label(
963 raidPtrs[unit]->Disks[row][column].dev,
964 raidPtrs[unit]->raid_cinfo[row][column].ci_vp,
965 &ci_label );
966 }
967 }
968
969 return (retcode);
970
971 /* initialize all parity */
972 case RAIDFRAME_REWRITEPARITY:
973
974 if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) {
975 /* Parity for RAID 0 is trivially correct */
976 raidPtrs[unit]->parity_good = RF_RAID_CLEAN;
977 return(0);
978 }
979
980 /* borrow the thread of the requesting process */
981
982 s = splbio();
983 retcode = rf_RewriteParity(raidPtrs[unit]);
984 splx(s);
985 /* return I/O Error if the parity rewrite fails */
986
987 if (retcode) {
988 retcode = EIO;
989 } else {
990 /* set the clean bit! If we shutdown correctly,
991 the clean bit on each component label will get
992 set */
993 raidPtrs[unit]->parity_good = RF_RAID_CLEAN;
994 }
995 return (retcode);
996
997
998 case RAIDFRAME_ADD_HOT_SPARE:
999 sparePtr = (RF_SingleComponent_t *) data;
1000 memcpy( &hot_spare, sparePtr, sizeof(RF_SingleComponent_t));
1001 printf("Adding spare\n");
1002 retcode = rf_add_hot_spare(raidPtrs[unit], &hot_spare);
1003 return(retcode);
1004
1005 case RAIDFRAME_REMOVE_HOT_SPARE:
1006 return(retcode);
1007
1008 case RAIDFRAME_REBUILD_IN_PLACE:
1009
1010 if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) {
1011 /* Can't do this on a RAID 0!! */
1012 return(EINVAL);
1013 }
1014
1015 componentPtr = (RF_SingleComponent_t *) data;
1016 memcpy( &component, componentPtr,
1017 sizeof(RF_SingleComponent_t));
1018 row = component.row;
1019 column = component.column;
1020 printf("Rebuild: %d %d\n",row, column);
1021 if ((row < 0) || (row >= raidPtrs[unit]->numRow) ||
1022 (column < 0) || (column >= raidPtrs[unit]->numCol)) {
1023 return(EINVAL);
1024 }
1025 printf("Attempting a rebuild in place\n");
1026 s = splbio();
1027 retcode = rf_ReconstructInPlace(raidPtrs[unit], row, column);
1028 splx(s);
1029 return(retcode);
1030
1031 case RAIDFRAME_GET_INFO:
1032 {
1033 RF_Raid_t *raid = raidPtrs[unit];
1034 RF_DeviceConfig_t *cfg, **ucfgp;
1035 int i, j, d;
1036
1037 if (!raid->valid)
1038 return (ENODEV);
1039 ucfgp = (RF_DeviceConfig_t **) data;
1040 RF_Malloc(cfg, sizeof(RF_DeviceConfig_t),
1041 (RF_DeviceConfig_t *));
1042 if (cfg == NULL)
1043 return (ENOMEM);
1044 bzero((char *) cfg, sizeof(RF_DeviceConfig_t));
1045 cfg->rows = raid->numRow;
1046 cfg->cols = raid->numCol;
1047 cfg->ndevs = raid->numRow * raid->numCol;
1048 if (cfg->ndevs >= RF_MAX_DISKS) {
1049 cfg->ndevs = 0;
1050 return (ENOMEM);
1051 }
1052 cfg->nspares = raid->numSpare;
1053 if (cfg->nspares >= RF_MAX_DISKS) {
1054 cfg->nspares = 0;
1055 return (ENOMEM);
1056 }
1057 cfg->maxqdepth = raid->maxQueueDepth;
1058 d = 0;
1059 for (i = 0; i < cfg->rows; i++) {
1060 for (j = 0; j < cfg->cols; j++) {
1061 cfg->devs[d] = raid->Disks[i][j];
1062 d++;
1063 }
1064 }
1065 for (j = cfg->cols, i = 0; i < cfg->nspares; i++, j++) {
1066 cfg->spares[i] = raid->Disks[0][j];
1067 }
1068 retcode = copyout((caddr_t) cfg, (caddr_t) * ucfgp,
1069 sizeof(RF_DeviceConfig_t));
1070 RF_Free(cfg, sizeof(RF_DeviceConfig_t));
1071
1072 return (retcode);
1073 }
1074 break;
1075 case RAIDFRAME_CHECK_PARITY:
1076 *(int *) data = raidPtrs[unit]->parity_good;
1077 return (0);
1078 case RAIDFRAME_RESET_ACCTOTALS:
1079 {
1080 RF_Raid_t *raid = raidPtrs[unit];
1081
1082 bzero(&raid->acc_totals, sizeof(raid->acc_totals));
1083 return (0);
1084 }
1085 break;
1086
1087 case RAIDFRAME_GET_ACCTOTALS:
1088 {
1089 RF_AccTotals_t *totals = (RF_AccTotals_t *) data;
1090 RF_Raid_t *raid = raidPtrs[unit];
1091
1092 *totals = raid->acc_totals;
1093 return (0);
1094 }
1095 break;
1096
1097 case RAIDFRAME_KEEP_ACCTOTALS:
1098 {
1099 RF_Raid_t *raid = raidPtrs[unit];
1100 int *keep = (int *) data;
1101
1102 raid->keep_acc_totals = *keep;
1103 return (0);
1104 }
1105 break;
1106
1107 case RAIDFRAME_GET_SIZE:
1108 *(int *) data = raidPtrs[unit]->totalSectors;
1109 return (0);
1110
1111 #define RAIDFRAME_RECON 1
1112 /* XXX The above should probably be set somewhere else!! GO */
1113 #if RAIDFRAME_RECON > 0
1114
1115 /* fail a disk & optionally start reconstruction */
1116 case RAIDFRAME_FAIL_DISK:
1117
1118 if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) {
1119 /* Can't do this on a RAID 0!! */
1120 return(EINVAL);
1121 }
1122
1123 rr = (struct rf_recon_req *) data;
1124
1125 if (rr->row < 0 || rr->row >= raidPtrs[unit]->numRow
1126 || rr->col < 0 || rr->col >= raidPtrs[unit]->numCol)
1127 return (EINVAL);
1128
1129 printf("raid%d: Failing the disk: row: %d col: %d\n",
1130 unit, rr->row, rr->col);
1131
1132 /* make a copy of the recon request so that we don't rely on
1133 * the user's buffer */
1134 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1135 bcopy(rr, rrcopy, sizeof(*rr));
1136 rrcopy->raidPtr = (void *) raidPtrs[unit];
1137
1138 LOCK_RECON_Q_MUTEX();
1139 rrcopy->next = recon_queue;
1140 recon_queue = rrcopy;
1141 wakeup(&recon_queue);
1142 UNLOCK_RECON_Q_MUTEX();
1143
1144 return (0);
1145
1146 /* invoke a copyback operation after recon on whatever disk
1147 * needs it, if any */
1148 case RAIDFRAME_COPYBACK:
1149
1150 if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) {
1151 /* This makes no sense on a RAID 0!! */
1152 return(EINVAL);
1153 }
1154
1155 /* borrow the current thread to get this done */
1156
1157 s = splbio();
1158 rf_CopybackReconstructedData(raidPtrs[unit]);
1159 splx(s);
1160 return (0);
1161
1162 /* return the percentage completion of reconstruction */
1163 case RAIDFRAME_CHECKRECON:
1164 if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) {
1165 /* This makes no sense on a RAID 0 */
1166 return(EINVAL);
1167 }
1168
1169 row = *(int *) data;
1170 if (row < 0 || row >= raidPtrs[unit]->numRow)
1171 return (EINVAL);
1172 if (raidPtrs[unit]->status[row] != rf_rs_reconstructing)
1173 *(int *) data = 100;
1174 else
1175 *(int *) data = raidPtrs[unit]->reconControl[row]->percentComplete;
1176 return (0);
1177
1178 /* the sparetable daemon calls this to wait for the kernel to
1179 * need a spare table. this ioctl does not return until a
1180 * spare table is needed. XXX -- calling mpsleep here in the
1181 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1182 * -- I should either compute the spare table in the kernel,
1183 * or have a different -- XXX XXX -- interface (a different
1184 * character device) for delivering the table -- XXX */
1185 #if 0
1186 case RAIDFRAME_SPARET_WAIT:
1187 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1188 while (!rf_sparet_wait_queue)
1189 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
1190 waitreq = rf_sparet_wait_queue;
1191 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1192 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1193
1194 *((RF_SparetWait_t *) data) = *waitreq; /* structure assignment */
1195
1196 RF_Free(waitreq, sizeof(*waitreq));
1197 return (0);
1198
1199
1200 /* wakes up a process waiting on SPARET_WAIT and puts an error
1201 * code in it that will cause the dameon to exit */
1202 case RAIDFRAME_ABORT_SPARET_WAIT:
1203 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1204 waitreq->fcol = -1;
1205 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1206 waitreq->next = rf_sparet_wait_queue;
1207 rf_sparet_wait_queue = waitreq;
1208 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1209 wakeup(&rf_sparet_wait_queue);
1210 return (0);
1211
1212 /* used by the spare table daemon to deliver a spare table
1213 * into the kernel */
1214 case RAIDFRAME_SEND_SPARET:
1215
1216 /* install the spare table */
1217 retcode = rf_SetSpareTable(raidPtrs[unit], *(void **) data);
1218
1219 /* respond to the requestor. the return status of the spare
1220 * table installation is passed in the "fcol" field */
1221 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1222 waitreq->fcol = retcode;
1223 RF_LOCK_MUTEX(rf_sparet_wait_mutex);
1224 waitreq->next = rf_sparet_resp_queue;
1225 rf_sparet_resp_queue = waitreq;
1226 wakeup(&rf_sparet_resp_queue);
1227 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
1228
1229 return (retcode);
1230 #endif
1231
1232
1233 #endif /* RAIDFRAME_RECON > 0 */
1234
1235 default:
1236 break; /* fall through to the os-specific code below */
1237
1238 }
1239
1240 if (!raidPtrs[unit]->valid)
1241 return (EINVAL);
1242
1243 /*
1244 * Add support for "regular" device ioctls here.
1245 */
1246
1247 switch (cmd) {
1248 case DIOCGDINFO:
1249 db1_printf(("DIOCGDINFO %d %d\n", (int) dev, (int) DISKPART(dev)));
1250 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1251 break;
1252
1253 case DIOCGPART:
1254 db1_printf(("DIOCGPART: %d %d\n", (int) dev, (int) DISKPART(dev)));
1255 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1256 ((struct partinfo *) data)->part =
1257 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1258 break;
1259
1260 case DIOCWDINFO:
1261 db1_printf(("DIOCWDINFO\n"));
1262 case DIOCSDINFO:
1263 db1_printf(("DIOCSDINFO\n"));
1264 if ((error = raidlock(rs)) != 0)
1265 return (error);
1266
1267 rs->sc_flags |= RAIDF_LABELLING;
1268
1269 error = setdisklabel(rs->sc_dkdev.dk_label,
1270 (struct disklabel *) data, 0, rs->sc_dkdev.dk_cpulabel);
1271 if (error == 0) {
1272 if (cmd == DIOCWDINFO)
1273 error = writedisklabel(RAIDLABELDEV(dev),
1274 raidstrategy, rs->sc_dkdev.dk_label,
1275 rs->sc_dkdev.dk_cpulabel);
1276 }
1277 rs->sc_flags &= ~RAIDF_LABELLING;
1278
1279 raidunlock(rs);
1280
1281 if (error)
1282 return (error);
1283 break;
1284
1285 case DIOCWLABEL:
1286 db1_printf(("DIOCWLABEL\n"));
1287 if (*(int *) data != 0)
1288 rs->sc_flags |= RAIDF_WLABEL;
1289 else
1290 rs->sc_flags &= ~RAIDF_WLABEL;
1291 break;
1292
1293 case DIOCGDEFLABEL:
1294 db1_printf(("DIOCGDEFLABEL\n"));
1295 raidgetdefaultlabel(raidPtrs[unit], rs,
1296 (struct disklabel *) data);
1297 break;
1298
1299 default:
1300 retcode = ENOTTY; /* XXXX ?? OR EINVAL ? */
1301 }
1302 return (retcode);
1303
1304 }
1305
1306
1307 /* raidinit -- complete the rest of the initialization for the
1308 RAIDframe device. */
1309
1310
1311 static int
1312 raidinit(dev, raidPtr, unit)
1313 dev_t dev;
1314 RF_Raid_t *raidPtr;
1315 int unit;
1316 {
1317 int retcode;
1318 /* int ix; */
1319 /* struct raidbuf *raidbp; */
1320 struct raid_softc *rs;
1321
1322 retcode = 0;
1323
1324 rs = &raid_softc[unit];
1325 pool_init(&rs->sc_cbufpool, sizeof(struct raidbuf), 0,
1326 0, 0, "raidpl", 0, NULL, NULL, M_RAIDFRAME);
1327
1328
1329 /* XXX should check return code first... */
1330 rs->sc_flags |= RAIDF_INITED;
1331
1332 sprintf(rs->sc_xname, "raid%d", unit); /* XXX doesn't check bounds. */
1333
1334 rs->sc_dkdev.dk_name = rs->sc_xname;
1335
1336 /* disk_attach actually creates space for the CPU disklabel, among
1337 * other things, so it's critical to call this *BEFORE* we try putzing
1338 * with disklabels. */
1339
1340 disk_attach(&rs->sc_dkdev);
1341
1342 /* XXX There may be a weird interaction here between this, and
1343 * protectedSectors, as used in RAIDframe. */
1344
1345 rs->sc_size = raidPtr->totalSectors;
1346 rs->sc_dev = dev;
1347
1348 return (retcode);
1349 }
1350
1351 /*
1352 * This kernel thread never exits. It is created once, and persists
1353 * until the system reboots.
1354 */
1355
void
rf_ReconKernelThread()
{
	struct rf_recon_req *req;
	int s;

	/* XXX not sure what spl() level we should be at here... probably
	 * splbio() */
	s = splbio();

	/* Service loop: consume reconstruction requests queued by the
	 * RAIDFRAME_FAIL_DISK ioctl (which also does the wakeup on
	 * &recon_queue). */
	while (1) {
		/* grab the next reconstruction request from the queue */
		LOCK_RECON_Q_MUTEX();
		while (!recon_queue) {
			/* drop the mutex by hand around the sleep --
			 * tsleep() knows nothing about it */
			UNLOCK_RECON_Q_MUTEX();
			tsleep(&recon_queue, PRIBIO,
			       "raidframe recon", 0);
			LOCK_RECON_Q_MUTEX();
		}
		/* pop the head of the singly-linked request list */
		req = recon_queue;
		recon_queue = recon_queue->next;
		UNLOCK_RECON_Q_MUTEX();

		/*
		 * If flags specifies that we should start recon, this call
		 * will not return until reconstruction completes, fails,
		 * or is aborted.
		 */
		rf_FailDisk((RF_Raid_t *) req->raidPtr, req->row, req->col,
			    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

		/* req was malloc'd by the ioctl path; we own it now */
		RF_Free(req, sizeof(*req));
	}
	/* NOTREACHED -- this thread never exits (see comment above) */
}
1390 /* wake up the daemon & tell it to get us a spare table
1391 * XXX
1392 * the entries in the queues should be tagged with the raidPtr
1393 * so that in the extremely rare case that two recons happen at once,
1394 * we know for which device were requesting a spare table
1395 * XXX
1396 */
int
rf_GetSpareTableFromDaemon(req)
	RF_SparetWait_t *req;
{
	int retcode;

	/* Hand our request to the sparetable daemon (blocked in
	 * RAIDFRAME_SPARET_WAIT) and wake it up. */
	RF_LOCK_MUTEX(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	wakeup(&rf_sparet_wait_queue);

	/* mpsleep unlocks the mutex */
	/* NOTE(review): the comment above is stale -- this code uses
	 * tsleep(), which does NOT release rf_sparet_wait_mutex around
	 * the sleep.  Whether that matters depends on what
	 * RF_LOCK_MUTEX expands to on this platform -- confirm. */
	while (!rf_sparet_resp_queue) {
		tsleep(&rf_sparet_resp_queue, PRIBIO,
		       "raidframe getsparetable", 0);
	}
	/* take the response off the head of the response queue; note
	 * that `req' is reused and now points at the daemon's reply,
	 * not our original request */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);

	/* the daemon passes its status back in fcol */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1422 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1423 * bp & passes it down.
1424 * any calls originating in the kernel must use non-blocking I/O
1425 * do some extra sanity checking to return "appropriate" error values for
1426 * certain conditions (to make some standard utilities work)
1427 */
int
rf_DoAccessKernel(raidPtr, bp, flags, cbFunc, cbArg)
	RF_Raid_t *raidPtr;
	struct buf *bp;
	RF_RaidAccessFlags_t flags;
	void (*cbFunc) (struct buf *);
	void *cbArg;
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	int retcode;
	struct partition *pp;
	daddr_t blocknum;
	int unit;
	struct raid_softc *rs;
	int do_async;

	/* XXX The dev_t used here should be for /dev/[r]raid* !!! */

	unit = raidPtr->raidid;
	rs = &raid_softc[unit];

	/* Ok, for the bp we have here, bp->b_blkno is relative to the
	 * partition.. Need to make it absolute to the underlying device.. */

	blocknum = bp->b_blkno;
	if (DISKPART(bp->b_dev) != RAW_PART) {
		/* add the partition's offset within the raid device */
		pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
		blocknum += pp->p_offset;
		db1_printf(("updated: %d %d\n", DISKPART(bp->b_dev),
			pp->p_offset));
	} else {
		db1_printf(("Is raw..\n"));
	}
	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno, (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to... but DO NOT
	 * TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* pb is 1 if the request ends in a partial sector */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			(int) raid_addr, (int) sum, (int) num_blocks,
			(int) pb, (int) bp->b_resid));
	}
	/* reject requests past the end of the array; the (sum < x)
	 * comparisons catch arithmetic wraparound of the sum above */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		bp->b_error = ENOSPC;
		bp->b_flags |= B_ERROR;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		return (bp->b_error);
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* reject transfers that are not a multiple of the sector size */
	if (bp->b_bcount & raidPtr->sectorMask) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		return (bp->b_error);
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Put a throttle on the number of requests we handle simultanously */

	RF_LOCK_MUTEX(raidPtr->mutex);

	/* block until an "opening" is available; rf_RaidIOThread (or
	 * whoever increments openings) does the wakeup */
	while(raidPtr->openings <= 0) {
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		(void)tsleep(&raidPtr->openings, PRIBIO, "rfdwait", 0);
		RF_LOCK_MUTEX(raidPtr->mutex);
	}
	raidPtr->openings--;

	RF_UNLOCK_MUTEX(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE. always condition on
	 * B_READ instead */
	retcode = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			      RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			      do_async, raid_addr, num_blocks,
			      bp->b_un.b_addr,
			      bp, NULL, NULL, RF_DAG_NONBLOCKING_IO | flags,
			      NULL, cbFunc, cbArg);
#if 0
	db1_printf(("After call to DoAccess: 0x%x 0x%x %d\n", bp,
		bp->b_data, (int) bp->b_resid));
#endif

	return (retcode);
}
1534 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
1535
int
rf_DispatchKernelIO(queue, req)
	RF_DiskQueue_t *queue;
	RF_DiskQueueData_t *req;
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;
	struct raidbuf *raidbp = NULL;
	struct raid_softc *rs;
	int unit;

	/* XXX along with the vnode, we also need the softc associated with
	 * this device.. */

	/* remember which queue this request belongs to -- needed by
	 * KernelWakeupFunc when the I/O completes */
	req->queue = queue;

	unit = queue->raidPtr->raidid;

	db1_printf(("DispatchKernelIO unit: %d\n", unit));

	if (unit >= numraid) {
		printf("Invalid unit number: %d %d\n", unit, numraid);
		panic("Invalid Unit number in rf_DispatchKernelIO\n");
	}
	rs = &raid_softc[unit];

	/* XXX is this the right place? */
	disk_busy(&rs->sc_dkdev);

	bp = req->bp;
#if 1
	/* XXX when there is a physical disk failure, someone is passing us a
	 * buffer that contains old stuff!! Attempt to deal with this problem
	 * without taking a performance hit... (not sure where the real bug
	 * is. It's buried in RAIDframe somewhere) :-( GO ) */

	if (bp->b_flags & B_ERROR) {
		bp->b_flags &= ~B_ERROR;
	}
	if (bp->b_error != 0) {
		bp->b_error = 0;
	}
#endif
	/* grab a raidbuf wrapper from the per-unit pool (freed by
	 * KernelWakeupFunc) */
	raidbp = RAIDGETBUF(rs);

	raidbp->rf_flags = 0;	/* XXX not really used anywhere... */

	/*
	 * context for raidiodone
	 */
	raidbp->rf_obp = bp;
	raidbp->req = req;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* Dprintf2("rf_DispatchKernelIO: NOP to r %d c %d\n",
		 * queue->row, queue->col); */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* XXX need to glue the original buffer into this?? */

		/* complete immediately -- no real I/O to issue */
		KernelWakeupFunc(&raidbp->rf_buf);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:

		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
		/* set up the component buf: target vnode/dev, offset,
		 * length, and KernelWakeupFunc as the iodone callback */
		InitBP(&raidbp->rf_buf, queue->rf_cinfo->ci_vp,
		    op | bp->b_flags, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		/* queue bookkeeping: one more I/O in flight */
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;
		/* Dprintf3("rf_DispatchKernelIO: %c to row %d col %d\n",
		 * req->type, queue->row, queue->col); */

		db1_printf(("Going for %c to unit %d row %d col %d\n",
			req->type, unit, queue->row, queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));
		/* writes must bump the vnode's output counter before
		 * being handed to the strategy routine */
		if ((raidbp->rf_buf.b_flags & B_READ) == 0) {
			raidbp->rf_buf.b_vp->v_numoutput++;
		}
		VOP_STRATEGY(&raidbp->rf_buf);

		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));
	return (0);
}
1648 /* this is the callback function associated with a I/O invoked from
1649 kernel code.
1650 */
static void
KernelWakeupFunc(vbp)
	struct buf *vbp;
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;
	/* the component buf is the first member of struct raidbuf, so
	 * the cast recovers our wrapper -- presumably rf_buf is at
	 * offset 0; confirm against the raidbuf declaration */
	struct raidbuf *raidbp = (struct raidbuf *) vbp;
	struct buf *bp;
	struct raid_softc *rs;
	int unit;
	register int s;

	s = splbio();		/* XXX */
	db1_printf(("recovering the request queue:\n"));
	req = raidbp->req;

	/* bp is the ORIGINAL buf handed to rf_DoAccessKernel; the
	 * component I/O just completed on raidbp->rf_buf */
	bp = raidbp->rf_obp;
#if 0
	db1_printf(("bp=0x%x\n", bp));
#endif

	queue = (RF_DiskQueue_t *) req->queue;

	/* propagate a component I/O error up to the original buf */
	if (raidbp->rf_buf.b_flags & B_ERROR) {
#if 0
		printf("Setting bp->b_flags!!! %d\n", raidbp->rf_buf.b_error);
#endif
		bp->b_flags |= B_ERROR;
		bp->b_error = raidbp->rf_buf.b_error ?
		    raidbp->rf_buf.b_error : EIO;
	}
#if 0
	db1_printf(("raidbp->rf_buf.b_bcount=%d\n", (int) raidbp->rf_buf.b_bcount));
	db1_printf(("raidbp->rf_buf.b_bufsize=%d\n", (int) raidbp->rf_buf.b_bufsize));
	db1_printf(("raidbp->rf_buf.b_resid=%d\n", (int) raidbp->rf_buf.b_resid));
	db1_printf(("raidbp->rf_buf.b_data=0x%x\n", raidbp->rf_buf.b_data));
#endif

	/* XXX methinks this could be wrong... */
#if 1
	bp->b_resid = raidbp->rf_buf.b_resid;
#endif

	/* accumulate physical-I/O timing stats if tracing is on */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		RF_LOCK_MUTEX(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		RF_UNLOCK_MUTEX(rf_tracing_mutex);
	}
	bp->b_bcount = raidbp->rf_buf.b_bcount;	/* XXXX ?? */

	unit = queue->raidPtr->raidid;	/* *Much* simpler :-> */


	/* XXX Ok, let's get aggressive... If B_ERROR is set, let's go
	 * ballistic, and mark the component as hosed... */
#if 1
	if (bp->b_flags & B_ERROR) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		if (queue->raidPtr->Disks[queue->row][queue->col].status ==
		    rf_ds_optimal) {
			printf("raid%d: IO Error. Marking %s as failed.\n",
			       unit, queue->raidPtr->Disks[queue->row][queue->col].devname);
			queue->raidPtr->Disks[queue->row][queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status[queue->row] = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* XXX here we should bump the version number for each component, and write that data out */
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}
#endif

	/* return the raidbuf wrapper to the per-unit pool */
	rs = &raid_softc[unit];
	RAIDPUTBUF(rs, raidbp);


	if (bp->b_resid == 0) {
		db1_printf(("Disk is no longer busy for this buffer... %d %ld %ld\n",
			unit, bp->b_resid, bp->b_bcount));
		/* XXX is this the right place for a disk_unbusy()??!??!?!? */
		disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid));
	} else {
		db1_printf(("b_resid is still %ld\n", bp->b_resid));
	}

	/* tell the queue (and then RAIDframe proper) that this
	 * component I/O is done, passing the error status along */
	rf_DiskIOComplete(queue, req, (bp->b_flags & B_ERROR) ? 1 : 0);
	(req->CompleteFunc) (req->argument, (bp->b_flags & B_ERROR) ? 1 : 0);
	/* printf("Exiting KernelWakeupFunc\n"); */

	splx(s);		/* XXX */
}
1749
1750
1751
1752 /*
1753 * initialize a buf structure for doing an I/O in the kernel.
1754 */
static void
InitBP(
    struct buf * bp,
    struct vnode * b_vp,
    unsigned rw_flag,
    dev_t dev,
    RF_SectorNum_t startSect,
    RF_SectorCount_t numSect,
    caddr_t buf,
    void (*cbFunc) (struct buf *),
    void *cbArg,
    int logBytesPerSector,
    struct proc * b_proc)
{
	/* B_CALL arranges for cbFunc (b_iodone) to run at biodone time */
	/* bp->b_flags = B_PHYS | rw_flag; */
	bp->b_flags = B_CALL | rw_flag;	/* XXX need B_PHYS here too??? */
	/* transfer length in bytes, derived from the sector count */
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	db1_printf(("bp->b_dev is %d\n", dev));
	bp->b_un.b_addr = buf;
#if 0
	db1_printf(("bp->b_data=0x%x\n", bp->b_data));
#endif

	bp->b_blkno = startSect;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	db1_printf(("b_bcount is: %d\n", (int) bp->b_bcount));
	/* a zero-length transfer indicates a caller bug upstream */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!\n");
	}
	bp->b_proc = b_proc;
	bp->b_iodone = cbFunc;
	bp->b_vp = b_vp;

}
1792
1793 static void
1794 raidgetdefaultlabel(raidPtr, rs, lp)
1795 RF_Raid_t *raidPtr;
1796 struct raid_softc *rs;
1797 struct disklabel *lp;
1798 {
1799 db1_printf(("Building a default label...\n"));
1800 bzero(lp, sizeof(*lp));
1801
1802 /* fabricate a label... */
1803 lp->d_secperunit = raidPtr->totalSectors;
1804 lp->d_secsize = raidPtr->bytesPerSector;
1805 lp->d_nsectors = 1024 * (1024 / raidPtr->bytesPerSector);
1806 lp->d_ntracks = 1;
1807 lp->d_ncylinders = raidPtr->totalSectors / lp->d_nsectors;
1808 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1809
1810 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
1811 lp->d_type = DTYPE_RAID;
1812 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1813 lp->d_rpm = 3600;
1814 lp->d_interleave = 1;
1815 lp->d_flags = 0;
1816
1817 lp->d_partitions[RAW_PART].p_offset = 0;
1818 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
1819 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1820 lp->d_npartitions = RAW_PART + 1;
1821
1822 lp->d_magic = DISKMAGIC;
1823 lp->d_magic2 = DISKMAGIC;
1824 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
1825
1826 }
1827 /*
1828 * Read the disklabel from the raid device. If one is not present, fake one
1829 * up.
1830 */
static void
raidgetdisklabel(dev)
	dev_t dev;
{
	int unit = raidunit(dev);
	struct raid_softc *rs = &raid_softc[unit];
	char *errstring;
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
	RF_Raid_t *raidPtr;

	db1_printf(("Getting the disklabel...\n"));

	bzero(clp, sizeof(*clp));

	raidPtr = raidPtrs[unit];

	/* start from a fabricated default in case the read fails */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		/* no on-disk label -- synthesize a minimal one */
		raidmakedisklabel(rs);
	else {
		int i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same componets are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("WARNING: %s: "
			    "total sector size in disklabel (%d) != "
			    "the size of raid (%ld)\n", rs->sc_xname,
			    lp->d_secperunit, (long) rs->sc_size);
		/* warn about (but do not reject) partitions that run
		 * past the end of the array */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("WARNING: %s: end of partition `%c' "
				    "exceeds the size of raid (%ld)\n",
				    rs->sc_xname, 'a' + i, (long) rs->sc_size);
		}
	}

}
1884 /*
1885 * Take care of things one might want to take care of in the event
1886 * that a disklabel isn't present.
1887 */
1888 static void
1889 raidmakedisklabel(rs)
1890 struct raid_softc *rs;
1891 {
1892 struct disklabel *lp = rs->sc_dkdev.dk_label;
1893 db1_printf(("Making a label..\n"));
1894
1895 /*
1896 * For historical reasons, if there's no disklabel present
1897 * the raw partition must be marked FS_BSDFFS.
1898 */
1899
1900 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1901
1902 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1903
1904 lp->d_checksum = dkcksum(lp);
1905 }
1906 /*
1907 * Lookup the provided name in the filesystem. If the file exists,
1908 * is a valid block device, and isn't being used by anyone else,
1909 * set *vpp to the file's vnode.
1910 * You'll find the original of this in ccd.c
1911 */
1912 int
1913 raidlookup(path, p, vpp)
1914 char *path;
1915 struct proc *p;
1916 struct vnode **vpp; /* result */
1917 {
1918 struct nameidata nd;
1919 struct vnode *vp;
1920 struct vattr va;
1921 int error;
1922
1923 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, p);
1924 if ((error = vn_open(&nd, FREAD | FWRITE, 0)) != 0) {
1925 #ifdef DEBUG
1926 printf("RAIDframe: vn_open returned %d\n", error);
1927 #endif
1928 return (error);
1929 }
1930 vp = nd.ni_vp;
1931 if (vp->v_usecount > 1) {
1932 VOP_UNLOCK(vp, 0);
1933 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
1934 return (EBUSY);
1935 }
1936 if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) != 0) {
1937 VOP_UNLOCK(vp, 0);
1938 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
1939 return (error);
1940 }
1941 /* XXX: eventually we should handle VREG, too. */
1942 if (va.va_type != VBLK) {
1943 VOP_UNLOCK(vp, 0);
1944 (void) vn_close(vp, FREAD | FWRITE, p->p_ucred, p);
1945 return (ENOTBLK);
1946 }
1947 VOP_UNLOCK(vp, 0);
1948 *vpp = vp;
1949 return (0);
1950 }
1951 /*
1952 * Wait interruptibly for an exclusive lock.
1953 *
1954 * XXX
1955 * Several drivers do this; it should be abstracted and made MP-safe.
1956 * (Hmm... where have we seen this warning before :-> GO )
1957 */
1958 static int
1959 raidlock(rs)
1960 struct raid_softc *rs;
1961 {
1962 int error;
1963
1964 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
1965 rs->sc_flags |= RAIDF_WANTED;
1966 if ((error =
1967 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
1968 return (error);
1969 }
1970 rs->sc_flags |= RAIDF_LOCKED;
1971 return (0);
1972 }
1973 /*
1974 * Unlock and wake up any waiters.
1975 */
1976 static void
1977 raidunlock(rs)
1978 struct raid_softc *rs;
1979 {
1980
1981 rs->sc_flags &= ~RAIDF_LOCKED;
1982 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
1983 rs->sc_flags &= ~RAIDF_WANTED;
1984 wakeup(rs);
1985 }
1986 }
1987
1988
1989 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
1990 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
1991
/* Read-modify-write the on-disk component label: bump the mod counter
 * and mark the component clean.  Always returns 0.
 * NOTE(review): raidread/raidwrite_component_label are defined below
 * with extra bshift/bsize parameters, but are called here with only
 * three arguments.  K&R definitions mean no compile-time diagnostic --
 * confirm which prototype is intended on this branch. */
int
raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter)
{
	RF_ComponentLabel_t component_label;
	raidread_component_label(dev, b_vp, &component_label);
	component_label.mod_counter = mod_counter;
	component_label.clean = RF_RAID_CLEAN;
	raidwrite_component_label(dev, b_vp, &component_label);
	return(0);
}
2002
2003
/* Read-modify-write the on-disk component label: bump the mod counter
 * and mark the component dirty (array in use).  Always returns 0.
 * NOTE(review): same arity mismatch as raidmarkclean() -- the label
 * read/write routines below declare bshift/bsize parameters that are
 * not passed here; verify against the intended interface. */
int
raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter)
{
	RF_ComponentLabel_t component_label;
	raidread_component_label(dev, b_vp, &component_label);
	component_label.mod_counter = mod_counter;
	component_label.clean = RF_RAID_DIRTY;
	raidwrite_component_label(dev, b_vp, &component_label);
	return(0);
}
2014
2015 /* ARGSUSED */
int
raidread_component_label(dev, b_vp, component_label, bshift)
	dev_t dev;
	struct vnode *b_vp;	/* unused (ARGSUSED) */
	RF_ComponentLabel_t *component_label;
	int bshift;
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	/* get a block of the appropriate size... */
	bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
	/* validate bshift AFTER geteblk so the out: path can still
	 * release the buffer */
	if (bshift < 0) {
		error = EINVAL;
		goto out;
	}
	bp->b_dev = dev;
	bp->b_bshift = bshift;
	bp->b_bsize = blocksize(bshift);

	/* get our ducks in a row for the read */
	bp->b_blkno = btodb(RF_COMPONENT_INFO_OFFSET, bshift);
	bp->b_resid = btodb(RF_COMPONENT_INFO_SIZE , bshift);
	bp->b_bcount = RF_COMPONENT_INFO_SIZE;
	bp->b_flags = B_BUSY | B_READ;

	/* issue the read synchronously through the block device */
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	error = biowait(bp);

 out:
	if (!error) {
		/* copy the label out of the borrowed buffer */
		memcpy(component_label, bp->b_un.b_addr,
		    sizeof(RF_ComponentLabel_t));
#if 0
		printf("raidread_component_label: got component label:\n");
		printf("Version: %d\n",component_label->version);
		printf("Serial Number: %d\n",component_label->serial_number);
		printf("Mod counter: %d\n",component_label->mod_counter);
		printf("Row: %d\n", component_label->row);
		printf("Column: %d\n", component_label->column);
		printf("Num Rows: %d\n", component_label->num_rows);
		printf("Num Columns: %d\n", component_label->num_columns);
		printf("Clean: %d\n", component_label->clean);
		printf("Status: %d\n", component_label->status);
#endif
	} else {
		printf("Failed to read RAID component label!\n");
	}

	/* throw the buffer away rather than caching stale label data */
	bp->b_flags = B_INVAL | B_AGE;
	brelse(bp);
	return(error);
}
2073 /* ARGSUSED */
int
raidwrite_component_label(dev, b_vp, component_label, bshift, bsize)
	dev_t dev;
	struct vnode *b_vp;	/* unused (ARGSUSED) */
	RF_ComponentLabel_t *component_label;
	int bshift;
	int bsize;		/* NOTE(review): never referenced below */
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
	/* validate bshift AFTER geteblk so the out: path can still
	 * release the buffer */
	if (bshift < 0) {
		error = EINVAL;
		goto out;
	}
	bp->b_dev = dev;
	bp->b_bshift = bshift;
	bp->b_bsize = blocksize(bshift);

	/* get our ducks in a row for the write */
	bp->b_blkno = btodb(RF_COMPONENT_INFO_OFFSET, bshift);
	bp->b_resid = btodb(RF_COMPONENT_INFO_SIZE, bshift);
	bp->b_bcount = RF_COMPONENT_INFO_SIZE;
	bp->b_flags = B_BUSY | B_WRITE;

	/* zero-fill the block, then drop the label at the front */
	memset( bp->b_un.b_addr, 0, RF_COMPONENT_INFO_SIZE );

	memcpy( bp->b_un.b_addr, component_label, sizeof(RF_ComponentLabel_t));

	/* issue the write synchronously through the block device */
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
	error = biowait(bp);

 out:
	/* discard the buffer; the label lives on disk, not in cache */
	bp->b_flags = B_INVAL | B_AGE;
	brelse(bp);
	if (error) {
		printf("Failed to write RAID component info!\n");
	}

	return(error);
}
2117
2118 void
2119 rf_markalldirty( raidPtr )
2120 RF_Raid_t *raidPtr;
2121 {
2122 RF_ComponentLabel_t c_label;
2123 int r,c;
2124
2125 raidPtr->mod_counter++;
2126 for (r = 0; r < raidPtr->numRow; r++) {
2127 for (c = 0; c < raidPtr->numCol; c++) {
2128 if (raidPtr->Disks[r][c].status != rf_ds_failed) {
2129 raidread_component_label(
2130 raidPtr->Disks[r][c].dev,
2131 raidPtr->raid_cinfo[r][c].ci_vp,
2132 &c_label);
2133 if (c_label.status == rf_ds_spared) {
2134 /* XXX do something special...
2135 but whatever you do, don't
2136 try to access it!! */
2137 } else {
2138 #if 0
2139 c_label.status =
2140 raidPtr->Disks[r][c].status;
2141 raidwrite_component_label(
2142 raidPtr->Disks[r][c].dev,
2143 raidPtr->raid_cinfo[r][c].ci_vp,
2144 &c_label);
2145 #endif
2146 raidmarkdirty(
2147 raidPtr->Disks[r][c].dev,
2148 raidPtr->raid_cinfo[r][c].ci_vp,
2149 raidPtr->mod_counter);
2150 }
2151 }
2152 }
2153 }
2154 /* printf("Component labels marked dirty.\n"); */
2155 #if 0
2156 for( c = 0; c < raidPtr->numSpare ; c++) {
2157 sparecol = raidPtr->numCol + c;
2158 if (raidPtr->Disks[r][sparecol].status == rf_ds_used_spare) {
2159 /*
2160
2161 XXX this is where we get fancy and map this spare
2162 into it's correct spot in the array.
2163
2164 */
2165 /*
2166
2167 we claim this disk is "optimal" if it's
2168 rf_ds_used_spare, as that means it should be
2169 directly substitutable for the disk it replaced.
2170 We note that too...
2171
2172 */
2173
2174 for(i=0;i<raidPtr->numRow;i++) {
2175 for(j=0;j<raidPtr->numCol;j++) {
2176 if ((raidPtr->Disks[i][j].spareRow ==
2177 r) &&
2178 (raidPtr->Disks[i][j].spareCol ==
2179 sparecol)) {
2180 srow = r;
2181 scol = sparecol;
2182 break;
2183 }
2184 }
2185 }
2186
2187 raidread_component_label(
2188 raidPtr->Disks[r][sparecol].dev,
2189 raidPtr->raid_cinfo[r][sparecol].ci_vp,
2190 &c_label);
2191 /* make sure status is noted */
2192 c_label.version = RF_COMPONENT_LABEL_VERSION;
2193 c_label.mod_counter = raidPtr->mod_counter;
2194 c_label.serial_number = raidPtr->serial_number;
2195 c_label.row = srow;
2196 c_label.column = scol;
2197 c_label.num_rows = raidPtr->numRow;
2198 c_label.num_columns = raidPtr->numCol;
2199 c_label.clean = RF_RAID_DIRTY; /* changed in a bit*/
2200 c_label.status = rf_ds_optimal;
2201 raidwrite_component_label(
2202 raidPtr->Disks[r][sparecol].dev,
2203 raidPtr->raid_cinfo[r][sparecol].ci_vp,
2204 &c_label);
2205 raidmarkclean( raidPtr->Disks[r][sparecol].dev,
2206 raidPtr->raid_cinfo[r][sparecol].ci_vp);
2207 }
2208 }
2209
2210 #endif
2211 }
2212
2213
2214 void
2215 rf_update_component_labels( raidPtr )
2216 RF_Raid_t *raidPtr;
2217 {
2218 RF_ComponentLabel_t c_label;
2219 int sparecol;
2220 int r,c;
2221 int i,j;
2222 int srow, scol;
2223
2224 srow = -1;
2225 scol = -1;
2226
2227 /* XXX should do extra checks to make sure things really are clean,
2228 rather than blindly setting the clean bit... */
2229
2230 raidPtr->mod_counter++;
2231
2232 for (r = 0; r < raidPtr->numRow; r++) {
2233 for (c = 0; c < raidPtr->numCol; c++) {
2234 if (raidPtr->Disks[r][c].status == rf_ds_optimal) {
2235 raidread_component_label(
2236 raidPtr->Disks[r][c].dev,
2237 raidPtr->raid_cinfo[r][c].ci_vp,
2238 &c_label);
2239 /* make sure status is noted */
2240 c_label.status = rf_ds_optimal;
2241 raidwrite_component_label(
2242 raidPtr->Disks[r][c].dev,
2243 raidPtr->raid_cinfo[r][c].ci_vp,
2244 &c_label);
2245 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2246 raidmarkclean(
2247 raidPtr->Disks[r][c].dev,
2248 raidPtr->raid_cinfo[r][c].ci_vp,
2249 raidPtr->mod_counter);
2250 }
2251 }
2252 /* else we don't touch it.. */
2253 #if 0
2254 else if (raidPtr->Disks[r][c].status !=
2255 rf_ds_failed) {
2256 raidread_component_label(
2257 raidPtr->Disks[r][c].dev,
2258 raidPtr->raid_cinfo[r][c].ci_vp,
2259 &c_label);
2260 /* make sure status is noted */
2261 c_label.status =
2262 raidPtr->Disks[r][c].status;
2263 raidwrite_component_label(
2264 raidPtr->Disks[r][c].dev,
2265 raidPtr->raid_cinfo[r][c].ci_vp,
2266 &c_label);
2267 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2268 raidmarkclean(
2269 raidPtr->Disks[r][c].dev,
2270 raidPtr->raid_cinfo[r][c].ci_vp,
2271 raidPtr->mod_counter);
2272 }
2273 }
2274 #endif
2275 }
2276 }
2277
2278 for( c = 0; c < raidPtr->numSpare ; c++) {
2279 sparecol = raidPtr->numCol + c;
2280 if (raidPtr->Disks[0][sparecol].status == rf_ds_used_spare) {
2281 /*
2282
2283 we claim this disk is "optimal" if it's
2284 rf_ds_used_spare, as that means it should be
2285 directly substitutable for the disk it replaced.
2286 We note that too...
2287
2288 */
2289
2290 for(i=0;i<raidPtr->numRow;i++) {
2291 for(j=0;j<raidPtr->numCol;j++) {
2292 if ((raidPtr->Disks[i][j].spareRow ==
2293 0) &&
2294 (raidPtr->Disks[i][j].spareCol ==
2295 sparecol)) {
2296 srow = i;
2297 scol = j;
2298 break;
2299 }
2300 }
2301 }
2302
2303 raidread_component_label(
2304 raidPtr->Disks[0][sparecol].dev,
2305 raidPtr->raid_cinfo[0][sparecol].ci_vp,
2306 &c_label);
2307 /* make sure status is noted */
2308 c_label.version = RF_COMPONENT_LABEL_VERSION;
2309 c_label.mod_counter = raidPtr->mod_counter;
2310 c_label.serial_number = raidPtr->serial_number;
2311 c_label.row = srow;
2312 c_label.column = scol;
2313 c_label.num_rows = raidPtr->numRow;
2314 c_label.num_columns = raidPtr->numCol;
2315 c_label.clean = RF_RAID_DIRTY; /* changed in a bit*/
2316 c_label.status = rf_ds_optimal;
2317 raidwrite_component_label(
2318 raidPtr->Disks[0][sparecol].dev,
2319 raidPtr->raid_cinfo[0][sparecol].ci_vp,
2320 &c_label);
2321 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2322 raidmarkclean( raidPtr->Disks[0][sparecol].dev,
2323 raidPtr->raid_cinfo[0][sparecol].ci_vp,
2324 raidPtr->mod_counter);
2325 }
2326 }
2327 }
2328 /* printf("Component labels updated\n"); */
2329 }
2330