/* $NetBSD: rf_netbsdkintf.c,v 1.295.4.2 2012/04/29 23:04:59 mrg Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.295.4.2 2012/04/29 23:04:59 mrg Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #include "raid.h"
110 #endif
111
112 #include <sys/param.h>
113 #include <sys/errno.h>
114 #include <sys/pool.h>
115 #include <sys/proc.h>
116 #include <sys/queue.h>
117 #include <sys/disk.h>
118 #include <sys/device.h>
119 #include <sys/stat.h>
120 #include <sys/ioctl.h>
121 #include <sys/fcntl.h>
122 #include <sys/systm.h>
123 #include <sys/vnode.h>
124 #include <sys/disklabel.h>
125 #include <sys/conf.h>
126 #include <sys/buf.h>
127 #include <sys/bufq.h>
128 #include <sys/reboot.h>
129 #include <sys/kauth.h>
130
131 #include <prop/proplib.h>
132
133 #include <dev/raidframe/raidframevar.h>
134 #include <dev/raidframe/raidframeio.h>
135 #include <dev/raidframe/rf_paritymap.h>
136
137 #include "rf_raid.h"
138 #include "rf_copyback.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150
151 #ifdef COMPAT_50
152 #include "rf_compat50.h"
153 #endif
154
155 #ifdef DEBUG
156 int rf_kdebug_level = 0;
157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
158 #else /* DEBUG */
159 #define db1_printf(a) { }
160 #endif /* DEBUG */
161
162 static RF_Raid_t **raidPtrs; /* global raid device descriptors */
163
164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
165 static rf_declare_mutex2(rf_sparet_wait_mutex);
166 static rf_declare_cond2(rf_sparet_wait_cv);
167 static rf_declare_cond2(rf_sparet_resp_cv);
168
169 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
170 * spare table */
171 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
172 * installation process */
173 #endif
174
175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
176
177 /* prototypes */
178 static void KernelWakeupFunc(struct buf *);
179 static void InitBP(struct buf *, struct vnode *, unsigned,
180 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
181 void *, int, struct proc *);
182 static void raidinit(RF_Raid_t *);
183
184 void raidattach(int);
185 static int raid_match(device_t, cfdata_t, void *);
186 static void raid_attach(device_t, device_t, void *);
187 static int raid_detach(device_t, int);
188
189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
190 daddr_t, daddr_t);
191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
192 daddr_t, daddr_t, int);
193
194 static int raidwrite_component_label(unsigned,
195 dev_t, struct vnode *, RF_ComponentLabel_t *);
196 static int raidread_component_label(unsigned,
197 dev_t, struct vnode *, RF_ComponentLabel_t *);
198
199
200 dev_type_open(raidopen);
201 dev_type_close(raidclose);
202 dev_type_read(raidread);
203 dev_type_write(raidwrite);
204 dev_type_ioctl(raidioctl);
205 dev_type_strategy(raidstrategy);
206 dev_type_dump(raiddump);
207 dev_type_size(raidsize);
208
209 const struct bdevsw raid_bdevsw = {
210 raidopen, raidclose, raidstrategy, raidioctl,
211 raiddump, raidsize, D_DISK
212 };
213
214 const struct cdevsw raid_cdevsw = {
215 raidopen, raidclose, raidread, raidwrite, raidioctl,
216 nostop, notty, nopoll, nommap, nokqfilter, D_DISK
217 };
218
219 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
220
221 /* XXX Not sure if the following should be replacing the raidPtrs above,
222 or if it should be used in conjunction with that...
223 */
224
225 struct raid_softc {
226 device_t sc_dev;
227 int sc_flags; /* flags */
228 int sc_cflags; /* configuration flags */
229 uint64_t sc_size; /* size of the raid device */
230 char sc_xname[20]; /* XXX external name */
231 struct disk sc_dkdev; /* generic disk device info */
232 struct bufq_state *buf_queue; /* used for the device queue */
233 };
234 /* sc_flags */
235 #define RAIDF_INITED 0x01 /* unit has been initialized */
236 #define RAIDF_WLABEL 0x02 /* label area is writable */
237 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
238 #define RAIDF_SHUTDOWN 0x08 /* unit is being shutdown */
239 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
240 #define RAIDF_LOCKED 0x80 /* unit is locked */
241
242 #define raidunit(x) DISKUNIT(x)
243 int numraid = 0;
244
245 extern struct cfdriver raid_cd;
246 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
247 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
248 DVF_DETACH_SHUTDOWN);
249
250 /*
251 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
252 * Be aware that large numbers can allow the driver to consume a lot of
253 * kernel memory, especially on writes, and in degraded mode reads.
254 *
255 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
256 * a single 64K write will typically require 64K for the old data,
257 * 64K for the old parity, and 64K for the new parity, for a total
258 * of 192K (if the parity buffer is not re-used immediately).
259 * Even it if is used immediately, that's still 128K, which when multiplied
260 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
261 *
262 * Now in degraded mode, for example, a 64K read on the above setup may
263 * require data reconstruction, which will require *all* of the 4 remaining
264 * disks to participate -- 4 * 32K/disk == 128K again.
265 */
266
267 #ifndef RAIDOUTSTANDING
268 #define RAIDOUTSTANDING 6
269 #endif
270
271 #define RAIDLABELDEV(dev) \
272 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
273
274 /* declared here, and made public, for the benefit of KVM stuff.. */
275 struct raid_softc *raid_softc;
276
277 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
278 struct disklabel *);
279 static void raidgetdisklabel(dev_t);
280 static void raidmakedisklabel(struct raid_softc *);
281
282 static int raidlock(struct raid_softc *);
283 static void raidunlock(struct raid_softc *);
284
285 static int raid_detach_unlocked(struct raid_softc *);
286
287 static void rf_markalldirty(RF_Raid_t *);
288 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
289
290 void rf_ReconThread(struct rf_recon_req *);
291 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
292 void rf_CopybackThread(RF_Raid_t *raidPtr);
293 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
294 int rf_autoconfig(device_t);
295 void rf_buildroothack(RF_ConfigSet_t *);
296
297 RF_AutoConfig_t *rf_find_raid_components(void);
298 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
299 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
300 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
301 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
302 int rf_set_autoconfig(RF_Raid_t *, int);
303 int rf_set_rootpartition(RF_Raid_t *, int);
304 void rf_release_all_vps(RF_ConfigSet_t *);
305 void rf_cleanup_config_set(RF_ConfigSet_t *);
306 int rf_have_enough_components(RF_ConfigSet_t *);
307 int rf_auto_config_set(RF_ConfigSet_t *, int *);
308 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
309
310 /*
311 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
312 * Note that this is overridden by having RAID_AUTOCONFIG as an option
313 * in the kernel config file.
314 */
315 #ifdef RAID_AUTOCONFIG
316 int raidautoconfig = 1;
317 #else
318 int raidautoconfig = 0;
319 #endif
320 static bool raidautoconfigdone = false;
321
322 struct RF_Pools_s rf_pools;
323
/*
 * raidattach: pseudo-device attach routine, called at boot with the
 * number of "raid" units configured in the kernel.  Allocates the
 * global raidPtrs[] and raid_softc[] arrays, boots the RAIDframe
 * engine, attaches the autoconf glue and registers a config
 * finalizer (rf_autoconfig) that will auto-configure any RAID sets
 * found on disk once all real hardware has been probed.
 *
 * On allocation failure we print a warning and bail; numraid is
 * shrunk so the partially initialized tail is never referenced.
 */
void
raidattach(int num)
{
	int raidID;
	int i, rc;

	aprint_debug("raidattach: Asked for %d units\n", num);

	if (num <= 0) {
#ifdef DIAGNOSTIC
		panic("raidattach: count <= 0");
#endif
		return;
	}
	/* This is where all the initialization stuff gets done. */

	numraid = num;

	/* Make some space for requested number of units... */

	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
	if (raidPtrs == NULL) {
		panic("raidPtrs is NULL!!");
	}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization for spare-table installation requests. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	for (i = 0; i < num; i++)
		raidPtrs[i] = NULL;
	rc = rf_BootRaidframe();
	if (rc == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	/* put together some datastructures like the CCD device does.. This
	 * lets us lock the device and what-not when it gets opened. */

	raid_softc = (struct raid_softc *)
	    malloc(num * sizeof(struct raid_softc),
		   M_RAIDFRAME, M_NOWAIT);
	if (raid_softc == NULL) {
		aprint_error("WARNING: no memory for RAIDframe driver\n");
		return;
	}

	memset(raid_softc, 0, num * sizeof(struct raid_softc));

	for (raidID = 0; raidID < num; raidID++) {
		/* FCFS queue; actual scheduling happens in the raid thread. */
		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);

		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
			  (RF_Raid_t *));
		if (raidPtrs[raidID] == NULL) {
			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
			/* Shrink the visible unit count so nothing ever
			 * touches the unallocated tail of raidPtrs[]. */
			numraid = raidID;
			return;
		}
	}

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
403
404 int
405 rf_autoconfig(device_t self)
406 {
407 RF_AutoConfig_t *ac_list;
408 RF_ConfigSet_t *config_sets;
409
410 if (!raidautoconfig || raidautoconfigdone == true)
411 return (0);
412
413 /* XXX This code can only be run once. */
414 raidautoconfigdone = true;
415
416 /* 1. locate all RAID components on the system */
417 aprint_debug("Searching for RAID components...\n");
418 ac_list = rf_find_raid_components();
419
420 /* 2. Sort them into their respective sets. */
421 config_sets = rf_create_auto_sets(ac_list);
422
423 /*
424 * 3. Evaluate each set andconfigure the valid ones.
425 * This gets done in rf_buildroothack().
426 */
427 rf_buildroothack(config_sets);
428
429 return 1;
430 }
431
/*
 * rf_buildroothack: walk the list of discovered configuration sets,
 * auto-configure each eligible one (enough components present and the
 * component labels ask for autoconfiguration), and release the
 * resources of the rest.  Afterwards, try to determine whether one of
 * the configured sets holds the root file system and, if so, point
 * booted_device at it.
 *
 * Root selection: if exactly one configured set is rootable, use it
 * (or its "a" wedge if wedges exist).  If several are, ask the MD
 * code for the boot device and pick the set containing it; if that is
 * still ambiguous, set RB_ASKNAME so the user is prompted.  When the
 * user hardwired a root spec, booted_device/boothowto are untouched.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int retcode;
	int raidID;
	int rootID;
	int col;
	int num_root;
	char *devname;

	rootID = 0;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		/* cset is freed below; remember the link first. */
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure==1) {
			retcode = rf_auto_config_set(cset,&raidID);
			if (!retcode) {
				aprint_debug("raid%d: configured ok\n", raidID);
				if (cset->rootable) {
					rootID = raidID;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		if (raid_softc[rootID].sc_dkdev.dk_nwedges != 0) {
			/* XXX: How do we find the real root partition? */
			char cname[sizeof(cset->ac->devname)];
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(raid_softc[rootID].sc_dev), 'a');
			booted_device = dkwedge_find_by_wname(cname);
		} else
			booted_device = raid_softc[rootID].sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/* Re-count: how many valid, root-flagged sets contain
		 * the component the machine actually booted from? */
		num_root = 0;
		for (raidID = 0; raidID < numraid; raidID++) {
			if (raidPtrs[raidID]->valid == 0)
				continue;

			if (raidPtrs[raidID]->root_partition == 0)
				continue;

			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
				devname = raidPtrs[raidID]->Disks[col].devname;
				/* skip the "/dev/" prefix for the compare */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
					    strlen(device_xname(booted_device))) != 0)
					continue;
				aprint_debug("raid%d includes boot device %s\n",
				    raidID, devname);
				num_root++;
				rootID = raidID;
			}
		}

		if (num_root == 1) {
			booted_device = raid_softc[rootID].sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
533
534
535 int
536 raidsize(dev_t dev)
537 {
538 struct raid_softc *rs;
539 struct disklabel *lp;
540 int part, unit, omask, size;
541
542 unit = raidunit(dev);
543 if (unit >= numraid)
544 return (-1);
545 rs = &raid_softc[unit];
546
547 if ((rs->sc_flags & RAIDF_INITED) == 0)
548 return (-1);
549
550 part = DISKPART(dev);
551 omask = rs->sc_dkdev.dk_openmask & (1 << part);
552 lp = rs->sc_dkdev.dk_label;
553
554 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
555 return (-1);
556
557 if (lp->d_partitions[part].p_fstype != FS_SWAP)
558 size = -1;
559 else
560 size = lp->d_partitions[part].p_size *
561 (lp->d_secsize / DEV_BSIZE);
562
563 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
564 return (-1);
565
566 return (size);
567
568 }
569
/*
 * raiddump: crash-dump entry point.  Writes 'size' bytes at 'va' to
 * block 'blkno' of the RAID device by dumping directly through one
 * live underlying component.  Only RAID 1 sets (1 data + 1 parity
 * column) are supported, since only there does a single component
 * hold a complete copy of the data.
 *
 * Returns 0 on success, or an errno (ENXIO for bad/unconfigured
 * units, EINVAL for non-RAID1 sets, unaligned sizes, out-of-range
 * blocks, or when no live component can be found).
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int part, c, sparecol, j, scol, dumpto;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);

	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* Dumps must be in whole DEV_BSIZE blocks. */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	/* Translate from RAID-partition-relative to component-relative
	 * addressing (see the longer comment before d_dump below). */
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	 */

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
707 /* ARGSUSED */
/*
 * raidopen: open entry point for both the block and character device.
 * Validates the unit and partition, re-reads the disklabel on the
 * first open of a configured set, records the open in the per-format
 * open masks, and on the very first open of a configured set marks
 * all components dirty so an unclean shutdown is detectable.
 *
 * Fails with EBUSY when the set is shutting down or when a
 * non-RAW_PART open is attempted while wedges exist, and ENXIO for
 * bad units or nonexistent partitions.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, pmask;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of a configured set: refresh the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(raidPtrs[unit]);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
793 /* ARGSUSED */
794 int
795 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
796 {
797 int unit = raidunit(dev);
798 struct raid_softc *rs;
799 int error = 0;
800 int part;
801
802 if (unit >= numraid)
803 return (ENXIO);
804 rs = &raid_softc[unit];
805
806 if ((error = raidlock(rs)) != 0)
807 return (error);
808
809 part = DISKPART(dev);
810
811 /* ...that much closer to allowing unconfiguration... */
812 switch (fmt) {
813 case S_IFCHR:
814 rs->sc_dkdev.dk_copenmask &= ~(1 << part);
815 break;
816
817 case S_IFBLK:
818 rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
819 break;
820 }
821 rs->sc_dkdev.dk_openmask =
822 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
823
824 if ((rs->sc_dkdev.dk_openmask == 0) &&
825 ((rs->sc_flags & RAIDF_INITED) != 0)) {
826 /* Last one... device is not unconfigured yet.
827 Device shutdown has taken care of setting the
828 clean bits if RAIDF_INITED is not set
829 mark things as clean... */
830
831 rf_update_component_labels(raidPtrs[unit],
832 RF_FINAL_COMPONENT_UPDATE);
833
834 /* If the kernel is shutting down, it will detach
835 * this RAID set soon enough.
836 */
837 }
838
839 raidunlock(rs);
840 return (0);
841
842 }
843
844 void
845 raidstrategy(struct buf *bp)
846 {
847 unsigned int raidID = raidunit(bp->b_dev);
848 RF_Raid_t *raidPtr;
849 struct raid_softc *rs = &raid_softc[raidID];
850 int wlabel;
851
852 if ((rs->sc_flags & RAIDF_INITED) ==0) {
853 bp->b_error = ENXIO;
854 goto done;
855 }
856 if (raidID >= numraid || !raidPtrs[raidID]) {
857 bp->b_error = ENODEV;
858 goto done;
859 }
860 raidPtr = raidPtrs[raidID];
861 if (!raidPtr->valid) {
862 bp->b_error = ENODEV;
863 goto done;
864 }
865 if (bp->b_bcount == 0) {
866 db1_printf(("b_bcount is zero..\n"));
867 goto done;
868 }
869
870 /*
871 * Do bounds checking and adjust transfer. If there's an
872 * error, the bounds check will flag that for us.
873 */
874
875 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
876 if (DISKPART(bp->b_dev) == RAW_PART) {
877 uint64_t size; /* device size in DEV_BSIZE unit */
878
879 if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
880 size = raidPtr->totalSectors <<
881 (raidPtr->logBytesPerSector - DEV_BSHIFT);
882 } else {
883 size = raidPtr->totalSectors >>
884 (DEV_BSHIFT - raidPtr->logBytesPerSector);
885 }
886 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
887 goto done;
888 }
889 } else {
890 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
891 db1_printf(("Bounds check failed!!:%d %d\n",
892 (int) bp->b_blkno, (int) wlabel));
893 goto done;
894 }
895 }
896
897 rf_lock_mutex2(raidPtr->iodone_lock);
898
899 bp->b_resid = 0;
900
901 /* stuff it onto our queue */
902 bufq_put(rs->buf_queue, bp);
903
904 /* scheduled the IO to happen at the next convenient time */
905 rf_signal_cond2(raidPtr->iodone_cv);
906 rf_unlock_mutex2(raidPtr->iodone_lock);
907
908 return;
909
910 done:
911 bp->b_resid = bp->b_bcount;
912 biodone(bp);
913 }
914 /* ARGSUSED */
915 int
916 raidread(dev_t dev, struct uio *uio, int flags)
917 {
918 int unit = raidunit(dev);
919 struct raid_softc *rs;
920
921 if (unit >= numraid)
922 return (ENXIO);
923 rs = &raid_softc[unit];
924
925 if ((rs->sc_flags & RAIDF_INITED) == 0)
926 return (ENXIO);
927
928 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
929
930 }
931 /* ARGSUSED */
932 int
933 raidwrite(dev_t dev, struct uio *uio, int flags)
934 {
935 int unit = raidunit(dev);
936 struct raid_softc *rs;
937
938 if (unit >= numraid)
939 return (ENXIO);
940 rs = &raid_softc[unit];
941
942 if ((rs->sc_flags & RAIDF_INITED) == 0)
943 return (ENXIO);
944
945 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
946
947 }
948
/*
 * raid_detach_unlocked: tear down one RAID unit.  Caller must already
 * hold the unit lock (raidlock).  Refuses with EBUSY while any
 * partition is still open; otherwise shuts down the RAIDframe engine
 * (if the unit was configured) and detaches/destroys the disk(9)
 * structures and any wedges on it.
 *
 * Returns 0 on success or an errno from rf_Shutdown().
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	int error;
	RF_Raid_t *raidPtr;

	raidPtr = raidPtrs[device_unit(rs->sc_dev)];

	/*
	 * If somebody has a partition mounted, we shouldn't
	 * shutdown.
	 */
	if (rs->sc_dkdev.dk_openmask != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		;	/* not initialized: nothing to do */
	else if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;
	else
		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);

	/* Detach the disk. */
	dkwedge_delall(&rs->sc_dkdev);
	disk_detach(&rs->sc_dkdev);
	disk_destroy(&rs->sc_dkdev);

	aprint_normal_dev(rs->sc_dev, "detached\n");

	return 0;
}
980
981 int
982 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
983 {
984 int unit = raidunit(dev);
985 int error = 0;
986 int part, pmask;
987 cfdata_t cf;
988 struct raid_softc *rs;
989 RF_Config_t *k_cfg, *u_cfg;
990 RF_Raid_t *raidPtr;
991 RF_RaidDisk_t *diskPtr;
992 RF_AccTotals_t *totals;
993 RF_DeviceConfig_t *d_cfg, **ucfgp;
994 u_char *specific_buf;
995 int retcode = 0;
996 int column;
997 /* int raidid; */
998 struct rf_recon_req *rrcopy, *rr;
999 RF_ComponentLabel_t *clabel;
1000 RF_ComponentLabel_t *ci_label;
1001 RF_ComponentLabel_t **clabel_ptr;
1002 RF_SingleComponent_t *sparePtr,*componentPtr;
1003 RF_SingleComponent_t component;
1004 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1005 int i, j, d;
1006 #ifdef __HAVE_OLD_DISKLABEL
1007 struct disklabel newlabel;
1008 #endif
1009 struct dkwedge_info *dkw;
1010
1011 if (unit >= numraid)
1012 return (ENXIO);
1013 rs = &raid_softc[unit];
1014 raidPtr = raidPtrs[unit];
1015
1016 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1017 (int) DISKPART(dev), (int) unit, cmd));
1018
1019 /* Must be open for writes for these commands... */
1020 switch (cmd) {
1021 #ifdef DIOCGSECTORSIZE
1022 case DIOCGSECTORSIZE:
1023 *(u_int *)data = raidPtr->bytesPerSector;
1024 return 0;
1025 case DIOCGMEDIASIZE:
1026 *(off_t *)data =
1027 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1028 return 0;
1029 #endif
1030 case DIOCSDINFO:
1031 case DIOCWDINFO:
1032 #ifdef __HAVE_OLD_DISKLABEL
1033 case ODIOCWDINFO:
1034 case ODIOCSDINFO:
1035 #endif
1036 case DIOCWLABEL:
1037 case DIOCAWEDGE:
1038 case DIOCDWEDGE:
1039 if ((flag & FWRITE) == 0)
1040 return (EBADF);
1041 }
1042
1043 /* Must be initialized for these... */
1044 switch (cmd) {
1045 case DIOCGDINFO:
1046 case DIOCSDINFO:
1047 case DIOCWDINFO:
1048 #ifdef __HAVE_OLD_DISKLABEL
1049 case ODIOCGDINFO:
1050 case ODIOCWDINFO:
1051 case ODIOCSDINFO:
1052 case ODIOCGDEFLABEL:
1053 #endif
1054 case DIOCGPART:
1055 case DIOCWLABEL:
1056 case DIOCGDEFLABEL:
1057 case DIOCAWEDGE:
1058 case DIOCDWEDGE:
1059 case DIOCLWEDGES:
1060 case DIOCCACHESYNC:
1061 case RAIDFRAME_SHUTDOWN:
1062 case RAIDFRAME_REWRITEPARITY:
1063 case RAIDFRAME_GET_INFO:
1064 case RAIDFRAME_RESET_ACCTOTALS:
1065 case RAIDFRAME_GET_ACCTOTALS:
1066 case RAIDFRAME_KEEP_ACCTOTALS:
1067 case RAIDFRAME_GET_SIZE:
1068 case RAIDFRAME_FAIL_DISK:
1069 case RAIDFRAME_COPYBACK:
1070 case RAIDFRAME_CHECK_RECON_STATUS:
1071 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1072 case RAIDFRAME_GET_COMPONENT_LABEL:
1073 case RAIDFRAME_SET_COMPONENT_LABEL:
1074 case RAIDFRAME_ADD_HOT_SPARE:
1075 case RAIDFRAME_REMOVE_HOT_SPARE:
1076 case RAIDFRAME_INIT_LABELS:
1077 case RAIDFRAME_REBUILD_IN_PLACE:
1078 case RAIDFRAME_CHECK_PARITY:
1079 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1080 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1081 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1082 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1083 case RAIDFRAME_SET_AUTOCONFIG:
1084 case RAIDFRAME_SET_ROOT:
1085 case RAIDFRAME_DELETE_COMPONENT:
1086 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1087 case RAIDFRAME_PARITYMAP_STATUS:
1088 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1089 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1090 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1091 if ((rs->sc_flags & RAIDF_INITED) == 0)
1092 return (ENXIO);
1093 }
1094
1095 switch (cmd) {
1096 #ifdef COMPAT_50
1097 case RAIDFRAME_GET_INFO50:
1098 return rf_get_info50(raidPtr, data);
1099
1100 case RAIDFRAME_CONFIGURE50:
1101 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1102 return retcode;
1103 goto config;
1104 #endif
1105 /* configure the system */
1106 case RAIDFRAME_CONFIGURE:
1107
1108 if (raidPtr->valid) {
1109 /* There is a valid RAID set running on this unit! */
1110 printf("raid%d: Device already configured!\n",unit);
1111 return(EINVAL);
1112 }
1113
1114 /* copy-in the configuration information */
1115 /* data points to a pointer to the configuration structure */
1116
1117 u_cfg = *((RF_Config_t **) data);
1118 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1119 if (k_cfg == NULL) {
1120 return (ENOMEM);
1121 }
1122 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1123 if (retcode) {
1124 RF_Free(k_cfg, sizeof(RF_Config_t));
1125 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1126 retcode));
1127 return (retcode);
1128 }
1129 goto config;
1130 config:
1131 /* allocate a buffer for the layout-specific data, and copy it
1132 * in */
1133 if (k_cfg->layoutSpecificSize) {
1134 if (k_cfg->layoutSpecificSize > 10000) {
1135 /* sanity check */
1136 RF_Free(k_cfg, sizeof(RF_Config_t));
1137 return (EINVAL);
1138 }
1139 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1140 (u_char *));
1141 if (specific_buf == NULL) {
1142 RF_Free(k_cfg, sizeof(RF_Config_t));
1143 return (ENOMEM);
1144 }
1145 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1146 k_cfg->layoutSpecificSize);
1147 if (retcode) {
1148 RF_Free(k_cfg, sizeof(RF_Config_t));
1149 RF_Free(specific_buf,
1150 k_cfg->layoutSpecificSize);
1151 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1152 retcode));
1153 return (retcode);
1154 }
1155 } else
1156 specific_buf = NULL;
1157 k_cfg->layoutSpecific = specific_buf;
1158
1159 /* should do some kind of sanity check on the configuration.
1160 * Store the sum of all the bytes in the last byte? */
1161
1162 /* configure the system */
1163
1164 /*
1165 * Clear the entire RAID descriptor, just to make sure
1166 * there is no stale data left in the case of a
1167 * reconfiguration
1168 */
1169 memset(raidPtr, 0, sizeof(*raidPtr));
1170 raidPtr->raidid = unit;
1171
1172 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1173
1174 if (retcode == 0) {
1175
1176 /* allow this many simultaneous IO's to
1177 this RAID device */
1178 raidPtr->openings = RAIDOUTSTANDING;
1179
1180 raidinit(raidPtr);
1181 rf_markalldirty(raidPtr);
1182 }
1183 /* free the buffers. No return code here. */
1184 if (k_cfg->layoutSpecificSize) {
1185 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1186 }
1187 RF_Free(k_cfg, sizeof(RF_Config_t));
1188
1189 return (retcode);
1190
1191 /* shutdown the system */
1192 case RAIDFRAME_SHUTDOWN:
1193
1194 part = DISKPART(dev);
1195 pmask = (1 << part);
1196
1197 if ((error = raidlock(rs)) != 0)
1198 return (error);
1199
1200 if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1201 ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1202 (rs->sc_dkdev.dk_copenmask & pmask)))
1203 retcode = EBUSY;
1204 else {
1205 rs->sc_flags |= RAIDF_SHUTDOWN;
1206 rs->sc_dkdev.dk_copenmask &= ~pmask;
1207 rs->sc_dkdev.dk_bopenmask &= ~pmask;
1208 rs->sc_dkdev.dk_openmask &= ~pmask;
1209 retcode = 0;
1210 }
1211
1212 raidunlock(rs);
1213
1214 if (retcode != 0)
1215 return retcode;
1216
1217 /* free the pseudo device attach bits */
1218
1219 cf = device_cfdata(rs->sc_dev);
1220 if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
1221 free(cf, M_RAIDFRAME);
1222
1223 return (retcode);
1224 case RAIDFRAME_GET_COMPONENT_LABEL:
1225 clabel_ptr = (RF_ComponentLabel_t **) data;
1226 /* need to read the component label for the disk indicated
1227 by row,column in clabel */
1228
1229 /*
1230 * Perhaps there should be an option to skip the in-core
1231 * copy and hit the disk, as with disklabel(8).
1232 */
1233 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1234
1235 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1236
1237 if (retcode) {
1238 RF_Free(clabel, sizeof(*clabel));
1239 return retcode;
1240 }
1241
1242 clabel->row = 0; /* Don't allow looking at anything else.*/
1243
1244 column = clabel->column;
1245
1246 if ((column < 0) || (column >= raidPtr->numCol +
1247 raidPtr->numSpare)) {
1248 RF_Free(clabel, sizeof(*clabel));
1249 return EINVAL;
1250 }
1251
1252 RF_Free(clabel, sizeof(*clabel));
1253
1254 clabel = raidget_component_label(raidPtr, column);
1255
1256 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1257
1258 #if 0
1259 case RAIDFRAME_SET_COMPONENT_LABEL:
1260 clabel = (RF_ComponentLabel_t *) data;
1261
1262 /* XXX check the label for valid stuff... */
1263 /* Note that some things *should not* get modified --
1264 the user should be re-initing the labels instead of
1265 trying to patch things.
1266 */
1267
1268 raidid = raidPtr->raidid;
1269 #ifdef DEBUG
1270 printf("raid%d: Got component label:\n", raidid);
1271 printf("raid%d: Version: %d\n", raidid, clabel->version);
1272 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1273 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1274 printf("raid%d: Column: %d\n", raidid, clabel->column);
1275 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1276 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1277 printf("raid%d: Status: %d\n", raidid, clabel->status);
1278 #endif
1279 clabel->row = 0;
1280 column = clabel->column;
1281
1282 if ((column < 0) || (column >= raidPtr->numCol)) {
1283 return(EINVAL);
1284 }
1285
1286 /* XXX this isn't allowed to do anything for now :-) */
1287
1288 /* XXX and before it is, we need to fill in the rest
1289 of the fields!?!?!?! */
1290 memcpy(raidget_component_label(raidPtr, column),
1291 clabel, sizeof(*clabel));
1292 raidflush_component_label(raidPtr, column);
1293 return (0);
1294 #endif
1295
1296 case RAIDFRAME_INIT_LABELS:
1297 clabel = (RF_ComponentLabel_t *) data;
1298 /*
1299 we only want the serial number from
1300 the above. We get all the rest of the information
1301 from the config that was used to create this RAID
1302 set.
1303 */
1304
1305 raidPtr->serial_number = clabel->serial_number;
1306
1307 for(column=0;column<raidPtr->numCol;column++) {
1308 diskPtr = &raidPtr->Disks[column];
1309 if (!RF_DEAD_DISK(diskPtr->status)) {
1310 ci_label = raidget_component_label(raidPtr,
1311 column);
1312 /* Zeroing this is important. */
1313 memset(ci_label, 0, sizeof(*ci_label));
1314 raid_init_component_label(raidPtr, ci_label);
1315 ci_label->serial_number =
1316 raidPtr->serial_number;
1317 ci_label->row = 0; /* we dont' pretend to support more */
1318 rf_component_label_set_partitionsize(ci_label,
1319 diskPtr->partitionSize);
1320 ci_label->column = column;
1321 raidflush_component_label(raidPtr, column);
1322 }
1323 /* XXXjld what about the spares? */
1324 }
1325
1326 return (retcode);
1327 case RAIDFRAME_SET_AUTOCONFIG:
1328 d = rf_set_autoconfig(raidPtr, *(int *) data);
1329 printf("raid%d: New autoconfig value is: %d\n",
1330 raidPtr->raidid, d);
1331 *(int *) data = d;
1332 return (retcode);
1333
1334 case RAIDFRAME_SET_ROOT:
1335 d = rf_set_rootpartition(raidPtr, *(int *) data);
1336 printf("raid%d: New rootpartition value is: %d\n",
1337 raidPtr->raidid, d);
1338 *(int *) data = d;
1339 return (retcode);
1340
1341 /* initialize all parity */
1342 case RAIDFRAME_REWRITEPARITY:
1343
1344 if (raidPtr->Layout.map->faultsTolerated == 0) {
1345 /* Parity for RAID 0 is trivially correct */
1346 raidPtr->parity_good = RF_RAID_CLEAN;
1347 return(0);
1348 }
1349
1350 if (raidPtr->parity_rewrite_in_progress == 1) {
1351 /* Re-write is already in progress! */
1352 return(EINVAL);
1353 }
1354
1355 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1356 rf_RewriteParityThread,
1357 raidPtr,"raid_parity");
1358 return (retcode);
1359
1360
1361 case RAIDFRAME_ADD_HOT_SPARE:
1362 sparePtr = (RF_SingleComponent_t *) data;
1363 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1364 retcode = rf_add_hot_spare(raidPtr, &component);
1365 return(retcode);
1366
1367 case RAIDFRAME_REMOVE_HOT_SPARE:
1368 return(retcode);
1369
1370 case RAIDFRAME_DELETE_COMPONENT:
1371 componentPtr = (RF_SingleComponent_t *)data;
1372 memcpy( &component, componentPtr,
1373 sizeof(RF_SingleComponent_t));
1374 retcode = rf_delete_component(raidPtr, &component);
1375 return(retcode);
1376
1377 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1378 componentPtr = (RF_SingleComponent_t *)data;
1379 memcpy( &component, componentPtr,
1380 sizeof(RF_SingleComponent_t));
1381 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1382 return(retcode);
1383
1384 case RAIDFRAME_REBUILD_IN_PLACE:
1385
1386 if (raidPtr->Layout.map->faultsTolerated == 0) {
1387 /* Can't do this on a RAID 0!! */
1388 return(EINVAL);
1389 }
1390
1391 if (raidPtr->recon_in_progress == 1) {
1392 /* a reconstruct is already in progress! */
1393 return(EINVAL);
1394 }
1395
1396 componentPtr = (RF_SingleComponent_t *) data;
1397 memcpy( &component, componentPtr,
1398 sizeof(RF_SingleComponent_t));
1399 component.row = 0; /* we don't support any more */
1400 column = component.column;
1401
1402 if ((column < 0) || (column >= raidPtr->numCol)) {
1403 return(EINVAL);
1404 }
1405
1406 rf_lock_mutex2(raidPtr->mutex);
1407 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1408 (raidPtr->numFailures > 0)) {
1409 /* XXX 0 above shouldn't be constant!!! */
1410 /* some component other than this has failed.
1411 Let's not make things worse than they already
1412 are... */
1413 printf("raid%d: Unable to reconstruct to disk at:\n",
1414 raidPtr->raidid);
1415 printf("raid%d: Col: %d Too many failures.\n",
1416 raidPtr->raidid, column);
1417 rf_unlock_mutex2(raidPtr->mutex);
1418 return (EINVAL);
1419 }
1420 if (raidPtr->Disks[column].status ==
1421 rf_ds_reconstructing) {
1422 printf("raid%d: Unable to reconstruct to disk at:\n",
1423 raidPtr->raidid);
1424 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column);
1425
1426 rf_unlock_mutex2(raidPtr->mutex);
1427 return (EINVAL);
1428 }
1429 if (raidPtr->Disks[column].status == rf_ds_spared) {
1430 rf_unlock_mutex2(raidPtr->mutex);
1431 return (EINVAL);
1432 }
1433 rf_unlock_mutex2(raidPtr->mutex);
1434
1435 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1436 if (rrcopy == NULL)
1437 return(ENOMEM);
1438
1439 rrcopy->raidPtr = (void *) raidPtr;
1440 rrcopy->col = column;
1441
1442 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1443 rf_ReconstructInPlaceThread,
1444 rrcopy,"raid_reconip");
1445 return(retcode);
1446
1447 case RAIDFRAME_GET_INFO:
1448 if (!raidPtr->valid)
1449 return (ENODEV);
1450 ucfgp = (RF_DeviceConfig_t **) data;
1451 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1452 (RF_DeviceConfig_t *));
1453 if (d_cfg == NULL)
1454 return (ENOMEM);
1455 d_cfg->rows = 1; /* there is only 1 row now */
1456 d_cfg->cols = raidPtr->numCol;
1457 d_cfg->ndevs = raidPtr->numCol;
1458 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1459 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1460 return (ENOMEM);
1461 }
1462 d_cfg->nspares = raidPtr->numSpare;
1463 if (d_cfg->nspares >= RF_MAX_DISKS) {
1464 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1465 return (ENOMEM);
1466 }
1467 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1468 d = 0;
1469 for (j = 0; j < d_cfg->cols; j++) {
1470 d_cfg->devs[d] = raidPtr->Disks[j];
1471 d++;
1472 }
1473 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1474 d_cfg->spares[i] = raidPtr->Disks[j];
1475 }
1476 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1477 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1478
1479 return (retcode);
1480
1481 case RAIDFRAME_CHECK_PARITY:
1482 *(int *) data = raidPtr->parity_good;
1483 return (0);
1484
1485 case RAIDFRAME_PARITYMAP_STATUS:
1486 if (rf_paritymap_ineligible(raidPtr))
1487 return EINVAL;
1488 rf_paritymap_status(raidPtr->parity_map,
1489 (struct rf_pmstat *)data);
1490 return 0;
1491
1492 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1493 if (rf_paritymap_ineligible(raidPtr))
1494 return EINVAL;
1495 if (raidPtr->parity_map == NULL)
1496 return ENOENT; /* ??? */
1497 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1498 (struct rf_pmparams *)data, 1))
1499 return EINVAL;
1500 return 0;
1501
1502 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1503 if (rf_paritymap_ineligible(raidPtr))
1504 return EINVAL;
1505 *(int *) data = rf_paritymap_get_disable(raidPtr);
1506 return 0;
1507
1508 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1509 if (rf_paritymap_ineligible(raidPtr))
1510 return EINVAL;
1511 rf_paritymap_set_disable(raidPtr, *(int *)data);
1512 /* XXX should errors be passed up? */
1513 return 0;
1514
1515 case RAIDFRAME_RESET_ACCTOTALS:
1516 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1517 return (0);
1518
1519 case RAIDFRAME_GET_ACCTOTALS:
1520 totals = (RF_AccTotals_t *) data;
1521 *totals = raidPtr->acc_totals;
1522 return (0);
1523
1524 case RAIDFRAME_KEEP_ACCTOTALS:
1525 raidPtr->keep_acc_totals = *(int *)data;
1526 return (0);
1527
1528 case RAIDFRAME_GET_SIZE:
1529 *(int *) data = raidPtr->totalSectors;
1530 return (0);
1531
1532 /* fail a disk & optionally start reconstruction */
1533 case RAIDFRAME_FAIL_DISK:
1534
1535 if (raidPtr->Layout.map->faultsTolerated == 0) {
1536 /* Can't do this on a RAID 0!! */
1537 return(EINVAL);
1538 }
1539
1540 rr = (struct rf_recon_req *) data;
1541 rr->row = 0;
1542 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1543 return (EINVAL);
1544
1545
1546 rf_lock_mutex2(raidPtr->mutex);
1547 if (raidPtr->status == rf_rs_reconstructing) {
1548 /* you can't fail a disk while we're reconstructing! */
1549 /* XXX wrong for RAID6 */
1550 rf_unlock_mutex2(raidPtr->mutex);
1551 return (EINVAL);
1552 }
1553 if ((raidPtr->Disks[rr->col].status ==
1554 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1555 /* some other component has failed. Let's not make
1556 things worse. XXX wrong for RAID6 */
1557 rf_unlock_mutex2(raidPtr->mutex);
1558 return (EINVAL);
1559 }
1560 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1561 /* Can't fail a spared disk! */
1562 rf_unlock_mutex2(raidPtr->mutex);
1563 return (EINVAL);
1564 }
1565 rf_unlock_mutex2(raidPtr->mutex);
1566
1567 /* make a copy of the recon request so that we don't rely on
1568 * the user's buffer */
1569 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1570 if (rrcopy == NULL)
1571 return(ENOMEM);
1572 memcpy(rrcopy, rr, sizeof(*rr));
1573 rrcopy->raidPtr = (void *) raidPtr;
1574
1575 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1576 rf_ReconThread,
1577 rrcopy,"raid_recon");
1578 return (0);
1579
1580 /* invoke a copyback operation after recon on whatever disk
1581 * needs it, if any */
1582 case RAIDFRAME_COPYBACK:
1583
1584 if (raidPtr->Layout.map->faultsTolerated == 0) {
1585 /* This makes no sense on a RAID 0!! */
1586 return(EINVAL);
1587 }
1588
1589 if (raidPtr->copyback_in_progress == 1) {
1590 /* Copyback is already in progress! */
1591 return(EINVAL);
1592 }
1593
1594 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1595 rf_CopybackThread,
1596 raidPtr,"raid_copyback");
1597 return (retcode);
1598
1599 /* return the percentage completion of reconstruction */
1600 case RAIDFRAME_CHECK_RECON_STATUS:
1601 if (raidPtr->Layout.map->faultsTolerated == 0) {
1602 /* This makes no sense on a RAID 0, so tell the
1603 user it's done. */
1604 *(int *) data = 100;
1605 return(0);
1606 }
1607 if (raidPtr->status != rf_rs_reconstructing)
1608 *(int *) data = 100;
1609 else {
1610 if (raidPtr->reconControl->numRUsTotal > 0) {
1611 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1612 } else {
1613 *(int *) data = 0;
1614 }
1615 }
1616 return (0);
1617 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1618 progressInfoPtr = (RF_ProgressInfo_t **) data;
1619 if (raidPtr->status != rf_rs_reconstructing) {
1620 progressInfo.remaining = 0;
1621 progressInfo.completed = 100;
1622 progressInfo.total = 100;
1623 } else {
1624 progressInfo.total =
1625 raidPtr->reconControl->numRUsTotal;
1626 progressInfo.completed =
1627 raidPtr->reconControl->numRUsComplete;
1628 progressInfo.remaining = progressInfo.total -
1629 progressInfo.completed;
1630 }
1631 retcode = copyout(&progressInfo, *progressInfoPtr,
1632 sizeof(RF_ProgressInfo_t));
1633 return (retcode);
1634
1635 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1636 if (raidPtr->Layout.map->faultsTolerated == 0) {
1637 /* This makes no sense on a RAID 0, so tell the
1638 user it's done. */
1639 *(int *) data = 100;
1640 return(0);
1641 }
1642 if (raidPtr->parity_rewrite_in_progress == 1) {
1643 *(int *) data = 100 *
1644 raidPtr->parity_rewrite_stripes_done /
1645 raidPtr->Layout.numStripe;
1646 } else {
1647 *(int *) data = 100;
1648 }
1649 return (0);
1650
1651 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1652 progressInfoPtr = (RF_ProgressInfo_t **) data;
1653 if (raidPtr->parity_rewrite_in_progress == 1) {
1654 progressInfo.total = raidPtr->Layout.numStripe;
1655 progressInfo.completed =
1656 raidPtr->parity_rewrite_stripes_done;
1657 progressInfo.remaining = progressInfo.total -
1658 progressInfo.completed;
1659 } else {
1660 progressInfo.remaining = 0;
1661 progressInfo.completed = 100;
1662 progressInfo.total = 100;
1663 }
1664 retcode = copyout(&progressInfo, *progressInfoPtr,
1665 sizeof(RF_ProgressInfo_t));
1666 return (retcode);
1667
1668 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1669 if (raidPtr->Layout.map->faultsTolerated == 0) {
1670 /* This makes no sense on a RAID 0 */
1671 *(int *) data = 100;
1672 return(0);
1673 }
1674 if (raidPtr->copyback_in_progress == 1) {
1675 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1676 raidPtr->Layout.numStripe;
1677 } else {
1678 *(int *) data = 100;
1679 }
1680 return (0);
1681
1682 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1683 progressInfoPtr = (RF_ProgressInfo_t **) data;
1684 if (raidPtr->copyback_in_progress == 1) {
1685 progressInfo.total = raidPtr->Layout.numStripe;
1686 progressInfo.completed =
1687 raidPtr->copyback_stripes_done;
1688 progressInfo.remaining = progressInfo.total -
1689 progressInfo.completed;
1690 } else {
1691 progressInfo.remaining = 0;
1692 progressInfo.completed = 100;
1693 progressInfo.total = 100;
1694 }
1695 retcode = copyout(&progressInfo, *progressInfoPtr,
1696 sizeof(RF_ProgressInfo_t));
1697 return (retcode);
1698
1699 /* the sparetable daemon calls this to wait for the kernel to
1700 * need a spare table. this ioctl does not return until a
1701 * spare table is needed. XXX -- calling mpsleep here in the
1702 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1703 * -- I should either compute the spare table in the kernel,
1704 * or have a different -- XXX XXX -- interface (a different
1705 * character device) for delivering the table -- XXX */
1706 #if 0
1707 case RAIDFRAME_SPARET_WAIT:
1708 rf_lock_mutex2(rf_sparet_wait_mutex);
1709 while (!rf_sparet_wait_queue)
1710 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1711 waitreq = rf_sparet_wait_queue;
1712 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1713 rf_unlock_mutex2(rf_sparet_wait_mutex);
1714
1715 /* structure assignment */
1716 *((RF_SparetWait_t *) data) = *waitreq;
1717
1718 RF_Free(waitreq, sizeof(*waitreq));
1719 return (0);
1720
1721 /* wakes up a process waiting on SPARET_WAIT and puts an error
1722 * code in it that will cause the dameon to exit */
1723 case RAIDFRAME_ABORT_SPARET_WAIT:
1724 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1725 waitreq->fcol = -1;
1726 rf_lock_mutex2(rf_sparet_wait_mutex);
1727 waitreq->next = rf_sparet_wait_queue;
1728 rf_sparet_wait_queue = waitreq;
1729 rf_broadcast_conf2(rf_sparet_wait_cv);
1730 rf_unlock_mutex2(rf_sparet_wait_mutex);
1731 return (0);
1732
1733 /* used by the spare table daemon to deliver a spare table
1734 * into the kernel */
1735 case RAIDFRAME_SEND_SPARET:
1736
1737 /* install the spare table */
1738 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1739
1740 /* respond to the requestor. the return status of the spare
1741 * table installation is passed in the "fcol" field */
1742 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1743 waitreq->fcol = retcode;
1744 rf_lock_mutex2(rf_sparet_wait_mutex);
1745 waitreq->next = rf_sparet_resp_queue;
1746 rf_sparet_resp_queue = waitreq;
1747 rf_broadcast_cond2(rf_sparet_resp_cv);
1748 rf_unlock_mutex2(rf_sparet_wait_mutex);
1749
1750 return (retcode);
1751 #endif
1752
1753 default:
1754 break; /* fall through to the os-specific code below */
1755
1756 }
1757
1758 if (!raidPtr->valid)
1759 return (EINVAL);
1760
1761 /*
1762 * Add support for "regular" device ioctls here.
1763 */
1764
1765 error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
1766 if (error != EPASSTHROUGH)
1767 return (error);
1768
1769 switch (cmd) {
1770 case DIOCGDINFO:
1771 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1772 break;
1773 #ifdef __HAVE_OLD_DISKLABEL
1774 case ODIOCGDINFO:
1775 newlabel = *(rs->sc_dkdev.dk_label);
1776 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1777 return ENOTTY;
1778 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1779 break;
1780 #endif
1781
1782 case DIOCGPART:
1783 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1784 ((struct partinfo *) data)->part =
1785 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1786 break;
1787
1788 case DIOCWDINFO:
1789 case DIOCSDINFO:
1790 #ifdef __HAVE_OLD_DISKLABEL
1791 case ODIOCWDINFO:
1792 case ODIOCSDINFO:
1793 #endif
1794 {
1795 struct disklabel *lp;
1796 #ifdef __HAVE_OLD_DISKLABEL
1797 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1798 memset(&newlabel, 0, sizeof newlabel);
1799 memcpy(&newlabel, data, sizeof (struct olddisklabel));
1800 lp = &newlabel;
1801 } else
1802 #endif
1803 lp = (struct disklabel *)data;
1804
1805 if ((error = raidlock(rs)) != 0)
1806 return (error);
1807
1808 rs->sc_flags |= RAIDF_LABELLING;
1809
1810 error = setdisklabel(rs->sc_dkdev.dk_label,
1811 lp, 0, rs->sc_dkdev.dk_cpulabel);
1812 if (error == 0) {
1813 if (cmd == DIOCWDINFO
1814 #ifdef __HAVE_OLD_DISKLABEL
1815 || cmd == ODIOCWDINFO
1816 #endif
1817 )
1818 error = writedisklabel(RAIDLABELDEV(dev),
1819 raidstrategy, rs->sc_dkdev.dk_label,
1820 rs->sc_dkdev.dk_cpulabel);
1821 }
1822 rs->sc_flags &= ~RAIDF_LABELLING;
1823
1824 raidunlock(rs);
1825
1826 if (error)
1827 return (error);
1828 break;
1829 }
1830
1831 case DIOCWLABEL:
1832 if (*(int *) data != 0)
1833 rs->sc_flags |= RAIDF_WLABEL;
1834 else
1835 rs->sc_flags &= ~RAIDF_WLABEL;
1836 break;
1837
1838 case DIOCGDEFLABEL:
1839 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1840 break;
1841
1842 #ifdef __HAVE_OLD_DISKLABEL
1843 case ODIOCGDEFLABEL:
1844 raidgetdefaultlabel(raidPtr, rs, &newlabel);
1845 if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1846 return ENOTTY;
1847 memcpy(data, &newlabel, sizeof (struct olddisklabel));
1848 break;
1849 #endif
1850
1851 case DIOCAWEDGE:
1852 case DIOCDWEDGE:
1853 dkw = (void *)data;
1854
1855 /* If the ioctl happens here, the parent is us. */
1856 (void)strcpy(dkw->dkw_parent, rs->sc_xname);
1857 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1858
1859 case DIOCLWEDGES:
1860 return dkwedge_list(&rs->sc_dkdev,
1861 (struct dkwedge_list *)data, l);
1862 case DIOCCACHESYNC:
1863 return rf_sync_component_caches(raidPtr);
1864 default:
1865 retcode = ENOTTY;
1866 }
1867 return (retcode);
1868
1869 }
1870
1871
1872 /* raidinit -- complete the rest of the initialization for the
1873 RAIDframe device. */
1874
1875
1876 static void
1877 raidinit(RF_Raid_t *raidPtr)
1878 {
1879 cfdata_t cf;
1880 struct raid_softc *rs;
1881 int unit;
1882
1883 unit = raidPtr->raidid;
1884
1885 rs = &raid_softc[unit];
1886
1887 /* XXX should check return code first... */
1888 rs->sc_flags |= RAIDF_INITED;
1889
1890 /* XXX doesn't check bounds. */
1891 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1892
1893 /* attach the pseudo device */
1894 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1895 cf->cf_name = raid_cd.cd_name;
1896 cf->cf_atname = raid_cd.cd_name;
1897 cf->cf_unit = unit;
1898 cf->cf_fstate = FSTATE_STAR;
1899
1900 rs->sc_dev = config_attach_pseudo(cf);
1901
1902 if (rs->sc_dev == NULL) {
1903 printf("raid%d: config_attach_pseudo failed\n",
1904 raidPtr->raidid);
1905 rs->sc_flags &= ~RAIDF_INITED;
1906 free(cf, M_RAIDFRAME);
1907 return;
1908 }
1909
1910 /* disk_attach actually creates space for the CPU disklabel, among
1911 * other things, so it's critical to call this *BEFORE* we try putzing
1912 * with disklabels. */
1913
1914 disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1915 disk_attach(&rs->sc_dkdev);
1916 disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
1917
1918 /* XXX There may be a weird interaction here between this, and
1919 * protectedSectors, as used in RAIDframe. */
1920
1921 rs->sc_size = raidPtr->totalSectors;
1922
1923 dkwedge_discover(&rs->sc_dkdev);
1924
1925 rf_set_properties(rs, raidPtr);
1926
1927 }
1928 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1929 /* wake up the daemon & tell it to get us a spare table
1930 * XXX
1931 * the entries in the queues should be tagged with the raidPtr
1932 * so that in the extremely rare case that two recons happen at once,
1933 * we know for which device were requesting a spare table
1934 * XXX
1935 *
1936 * XXX This code is not currently used. GO
1937 */
/*
 * Hand a spare-table request to the user-land sparetable daemon and
 * block until the daemon delivers a response on rf_sparet_resp_queue
 * (via RAIDFRAME_SEND_SPARET).  Returns the "fcol" status field from
 * the response entry.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	/* Queue the request and wake the daemon blocked in SPARET_WAIT. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/*
	 * Sleep until a response appears.  rf_wait_cond2() releases the
	 * mutex while blocked and reacquires it on wakeup (usual condvar
	 * semantics; the old "mpsleep unlocks the mutex" comment predates
	 * the condvar conversion).
	 */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Unlink the first response entry; note 'req' is reused here. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1961 #endif
1962
/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Formerly known as: rf_DoAccessKernel
 *
 * Locking: raidPtr->mutex is held whenever the "while" condition is
 * evaluated and is dropped while each buffer is processed; the function
 * returns with the mutex released.
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	int unit;
	struct raid_softc *rs;
	int do_async;
	struct buf *bp;
	int rc;

	unit = raidPtr->raidid;
	rs = &raid_softc[unit];

	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* component-label update must run without the mutex held */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do; mutex already released */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* convert DEV_BSIZE units to RAID sectors, then add the
		 * partition offset (unless I/O is on the raw partition) */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		/* pb accounts for a trailing partial sector; sum is the
		 * end address used for the range/overflow checks below */
		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* NOTE(review): "1 ||" forces this debug branch on
		 * unconditionally — looks like a debugging leftover
		 * (db1_printf is normally compiled out anyway). */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* reject I/O past the end of the set; the (sum < x) terms
		 * catch arithmetic wraparound of the unsigned sum */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* reject transfers that are not a multiple of the sector
		 * size */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* consume one opening; the completion path gives it back */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* NOTE(review): on this failure path the opening
			 * consumed above does not appear to be restored
			 * here — presumably handled (or leaked?) in the
			 * completion machinery; worth confirming. */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
2091
2092
2093
2094
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry.
 *
 * Translates a RAIDframe disk-queue request (NOP, READ, or WRITE) into a
 * struct buf and hands it to the underlying component via bdev_strategy().
 * Completion is delivered asynchronously through KernelWakeupFunc().
 * Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		/* NOTE(review): the doubled parentheses suggest this was
		 * once a db1_printf(); as written it is an unconditional
		 * printf of the parenthesized string. */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* fake an immediate completion: no real I/O is issued,
		 * the wakeup function runs directly on this bp */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		/* start the disk-wait timer for access tracing */
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* fill in the buf from the request; KernelWakeupFunc is
		 * installed as the biodone callback with req as its arg */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		       op, queue->rf_cinfo->ci_dev,
		       req->sectorOffset, req->numSector,
		       req->buf, KernelWakeupFunc, (void *) req,
		       queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				    (long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			    (int) req->sectorOffset, (int) req->numSector,
			    (int) (req->numSector <<
				   queue->raidPtr->logBytesPerSector),
			    (int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/*
 * KernelWakeupFunc: the callback function associated with a I/O invoked
 * from kernel code (set as b_iodone by InitBP(), so it runs at biodone
 * time for I/Os dispatched by rf_DispatchKernelIO()).
 *
 * Records the completion status of the component I/O, marks the
 * component failed on error (if the set can tolerate it), and queues
 * the request on raidPtr->iodone for the raidio thread to finish.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* InitBP() stashed the request pointer in b_private */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* accumulate per-access physical I/O timing statistics */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error. Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2238
2239
2240 /*
2241 * initialize a buf structure for doing an I/O in the kernel.
2242 */
2243 static void
2244 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2245 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2246 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2247 struct proc *b_proc)
2248 {
2249 /* bp->b_flags = B_PHYS | rw_flag; */
2250 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2251 bp->b_oflags = 0;
2252 bp->b_cflags = 0;
2253 bp->b_bcount = numSect << logBytesPerSector;
2254 bp->b_bufsize = bp->b_bcount;
2255 bp->b_error = 0;
2256 bp->b_dev = dev;
2257 bp->b_data = bf;
2258 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2259 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2260 if (bp->b_bcount == 0) {
2261 panic("bp->b_bcount is zero in InitBP!!");
2262 }
2263 bp->b_proc = b_proc;
2264 bp->b_iodone = cbFunc;
2265 bp->b_private = cbArg;
2266 }
2267
2268 static void
2269 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2270 struct disklabel *lp)
2271 {
2272 memset(lp, 0, sizeof(*lp));
2273
2274 /* fabricate a label... */
2275 lp->d_secperunit = raidPtr->totalSectors;
2276 lp->d_secsize = raidPtr->bytesPerSector;
2277 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2278 lp->d_ntracks = 4 * raidPtr->numCol;
2279 lp->d_ncylinders = raidPtr->totalSectors /
2280 (lp->d_nsectors * lp->d_ntracks);
2281 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2282
2283 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2284 lp->d_type = DTYPE_RAID;
2285 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2286 lp->d_rpm = 3600;
2287 lp->d_interleave = 1;
2288 lp->d_flags = 0;
2289
2290 lp->d_partitions[RAW_PART].p_offset = 0;
2291 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2292 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2293 lp->d_npartitions = RAW_PART + 1;
2294
2295 lp->d_magic = DISKMAGIC;
2296 lp->d_magic2 = DISKMAGIC;
2297 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2298
2299 }
2300 /*
2301 * Read the disklabel from the raid device. If one is not present, fake one
2302 * up.
2303 */
2304 static void
2305 raidgetdisklabel(dev_t dev)
2306 {
2307 int unit = raidunit(dev);
2308 struct raid_softc *rs = &raid_softc[unit];
2309 const char *errstring;
2310 struct disklabel *lp = rs->sc_dkdev.dk_label;
2311 struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
2312 RF_Raid_t *raidPtr;
2313
2314 db1_printf(("Getting the disklabel...\n"));
2315
2316 memset(clp, 0, sizeof(*clp));
2317
2318 raidPtr = raidPtrs[unit];
2319
2320 raidgetdefaultlabel(raidPtr, rs, lp);
2321
2322 /*
2323 * Call the generic disklabel extraction routine.
2324 */
2325 errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
2326 rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
2327 if (errstring)
2328 raidmakedisklabel(rs);
2329 else {
2330 int i;
2331 struct partition *pp;
2332
2333 /*
2334 * Sanity check whether the found disklabel is valid.
2335 *
2336 * This is necessary since total size of the raid device
2337 * may vary when an interleave is changed even though exactly
2338 * same components are used, and old disklabel may used
2339 * if that is found.
2340 */
2341 if (lp->d_secperunit != rs->sc_size)
2342 printf("raid%d: WARNING: %s: "
2343 "total sector size in disklabel (%" PRIu32 ") != "
2344 "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
2345 lp->d_secperunit, rs->sc_size);
2346 for (i = 0; i < lp->d_npartitions; i++) {
2347 pp = &lp->d_partitions[i];
2348 if (pp->p_offset + pp->p_size > rs->sc_size)
2349 printf("raid%d: WARNING: %s: end of partition `%c' "
2350 "exceeds the size of raid (%" PRIu64 ")\n",
2351 unit, rs->sc_xname, 'a' + i, rs->sc_size);
2352 }
2353 }
2354
2355 }
2356 /*
2357 * Take care of things one might want to take care of in the event
2358 * that a disklabel isn't present.
2359 */
2360 static void
2361 raidmakedisklabel(struct raid_softc *rs)
2362 {
2363 struct disklabel *lp = rs->sc_dkdev.dk_label;
2364 db1_printf(("Making a label..\n"));
2365
2366 /*
2367 * For historical reasons, if there's no disklabel present
2368 * the raw partition must be marked FS_BSDFFS.
2369 */
2370
2371 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2372
2373 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2374
2375 lp->d_checksum = dkcksum(lp);
2376 }
2377 /*
2378 * Wait interruptibly for an exclusive lock.
2379 *
2380 * XXX
2381 * Several drivers do this; it should be abstracted and made MP-safe.
2382 * (Hmm... where have we seen this warning before :-> GO )
2383 */
2384 static int
2385 raidlock(struct raid_softc *rs)
2386 {
2387 int error;
2388
2389 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2390 rs->sc_flags |= RAIDF_WANTED;
2391 if ((error =
2392 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2393 return (error);
2394 }
2395 rs->sc_flags |= RAIDF_LOCKED;
2396 return (0);
2397 }
2398 /*
2399 * Unlock and wake up any waiters.
2400 */
2401 static void
2402 raidunlock(struct raid_softc *rs)
2403 {
2404
2405 rs->sc_flags &= ~RAIDF_LOCKED;
2406 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2407 rs->sc_flags &= ~RAIDF_WANTED;
2408 wakeup(rs);
2409 }
2410 }
2411
2412
2413 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2414 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2415 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2416
2417 static daddr_t
2418 rf_component_info_offset(void)
2419 {
2420
2421 return RF_COMPONENT_INFO_OFFSET;
2422 }
2423
2424 static daddr_t
2425 rf_component_info_size(unsigned secsize)
2426 {
2427 daddr_t info_size;
2428
2429 KASSERT(secsize);
2430 if (secsize > RF_COMPONENT_INFO_SIZE)
2431 info_size = secsize;
2432 else
2433 info_size = RF_COMPONENT_INFO_SIZE;
2434
2435 return info_size;
2436 }
2437
2438 static daddr_t
2439 rf_parity_map_offset(RF_Raid_t *raidPtr)
2440 {
2441 daddr_t map_offset;
2442
2443 KASSERT(raidPtr->bytesPerSector);
2444 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2445 map_offset = raidPtr->bytesPerSector;
2446 else
2447 map_offset = RF_COMPONENT_INFO_SIZE;
2448 map_offset += rf_component_info_offset();
2449
2450 return map_offset;
2451 }
2452
2453 static daddr_t
2454 rf_parity_map_size(RF_Raid_t *raidPtr)
2455 {
2456 daddr_t map_size;
2457
2458 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2459 map_size = raidPtr->bytesPerSector;
2460 else
2461 map_size = RF_PARITY_MAP_SIZE;
2462
2463 return map_size;
2464 }
2465
2466 int
2467 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2468 {
2469 RF_ComponentLabel_t *clabel;
2470
2471 clabel = raidget_component_label(raidPtr, col);
2472 clabel->clean = RF_RAID_CLEAN;
2473 raidflush_component_label(raidPtr, col);
2474 return(0);
2475 }
2476
2477
2478 int
2479 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2480 {
2481 RF_ComponentLabel_t *clabel;
2482
2483 clabel = raidget_component_label(raidPtr, col);
2484 clabel->clean = RF_RAID_DIRTY;
2485 raidflush_component_label(raidPtr, col);
2486 return(0);
2487 }
2488
2489 int
2490 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2491 {
2492 KASSERT(raidPtr->bytesPerSector);
2493 return raidread_component_label(raidPtr->bytesPerSector,
2494 raidPtr->Disks[col].dev,
2495 raidPtr->raid_cinfo[col].ci_vp,
2496 &raidPtr->raid_cinfo[col].ci_label);
2497 }
2498
2499 RF_ComponentLabel_t *
2500 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2501 {
2502 return &raidPtr->raid_cinfo[col].ci_label;
2503 }
2504
2505 int
2506 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2507 {
2508 RF_ComponentLabel_t *label;
2509
2510 label = &raidPtr->raid_cinfo[col].ci_label;
2511 label->mod_counter = raidPtr->mod_counter;
2512 #ifndef RF_NO_PARITY_MAP
2513 label->parity_map_modcount = label->mod_counter;
2514 #endif
2515 return raidwrite_component_label(raidPtr->bytesPerSector,
2516 raidPtr->Disks[col].dev,
2517 raidPtr->raid_cinfo[col].ci_vp, label);
2518 }
2519
2520
2521 static int
2522 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2523 RF_ComponentLabel_t *clabel)
2524 {
2525 return raidread_component_area(dev, b_vp, clabel,
2526 sizeof(RF_ComponentLabel_t),
2527 rf_component_info_offset(),
2528 rf_component_info_size(secsize));
2529 }
2530
2531 /* ARGSUSED */
2532 static int
2533 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2534 size_t msize, daddr_t offset, daddr_t dsize)
2535 {
2536 struct buf *bp;
2537 const struct bdevsw *bdev;
2538 int error;
2539
2540 /* XXX should probably ensure that we don't try to do this if
2541 someone has changed rf_protected_sectors. */
2542
2543 if (b_vp == NULL) {
2544 /* For whatever reason, this component is not valid.
2545 Don't try to read a component label from it. */
2546 return(EINVAL);
2547 }
2548
2549 /* get a block of the appropriate size... */
2550 bp = geteblk((int)dsize);
2551 bp->b_dev = dev;
2552
2553 /* get our ducks in a row for the read */
2554 bp->b_blkno = offset / DEV_BSIZE;
2555 bp->b_bcount = dsize;
2556 bp->b_flags |= B_READ;
2557 bp->b_resid = dsize;
2558
2559 bdev = bdevsw_lookup(bp->b_dev);
2560 if (bdev == NULL)
2561 return (ENXIO);
2562 (*bdev->d_strategy)(bp);
2563
2564 error = biowait(bp);
2565
2566 if (!error) {
2567 memcpy(data, bp->b_data, msize);
2568 }
2569
2570 brelse(bp, 0);
2571 return(error);
2572 }
2573
2574
2575 static int
2576 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2577 RF_ComponentLabel_t *clabel)
2578 {
2579 return raidwrite_component_area(dev, b_vp, clabel,
2580 sizeof(RF_ComponentLabel_t),
2581 rf_component_info_offset(),
2582 rf_component_info_size(secsize), 0);
2583 }
2584
2585 /* ARGSUSED */
2586 static int
2587 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2588 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2589 {
2590 struct buf *bp;
2591 const struct bdevsw *bdev;
2592 int error;
2593
2594 /* get a block of the appropriate size... */
2595 bp = geteblk((int)dsize);
2596 bp->b_dev = dev;
2597
2598 /* get our ducks in a row for the write */
2599 bp->b_blkno = offset / DEV_BSIZE;
2600 bp->b_bcount = dsize;
2601 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2602 bp->b_resid = dsize;
2603
2604 memset(bp->b_data, 0, dsize);
2605 memcpy(bp->b_data, data, msize);
2606
2607 bdev = bdevsw_lookup(bp->b_dev);
2608 if (bdev == NULL)
2609 return (ENXIO);
2610 (*bdev->d_strategy)(bp);
2611 if (asyncp)
2612 return 0;
2613 error = biowait(bp);
2614 brelse(bp, 0);
2615 if (error) {
2616 #if 1
2617 printf("Failed to write RAID component info!\n");
2618 #endif
2619 }
2620
2621 return(error);
2622 }
2623
2624 void
2625 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2626 {
2627 int c;
2628
2629 for (c = 0; c < raidPtr->numCol; c++) {
2630 /* Skip dead disks. */
2631 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2632 continue;
2633 /* XXXjld: what if an error occurs here? */
2634 raidwrite_component_area(raidPtr->Disks[c].dev,
2635 raidPtr->raid_cinfo[c].ci_vp, map,
2636 RF_PARITYMAP_NBYTE,
2637 rf_parity_map_offset(raidPtr),
2638 rf_parity_map_size(raidPtr), 0);
2639 }
2640 }
2641
2642 void
2643 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2644 {
2645 struct rf_paritymap_ondisk tmp;
2646 int c,first;
2647
2648 first=1;
2649 for (c = 0; c < raidPtr->numCol; c++) {
2650 /* Skip dead disks. */
2651 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2652 continue;
2653 raidread_component_area(raidPtr->Disks[c].dev,
2654 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2655 RF_PARITYMAP_NBYTE,
2656 rf_parity_map_offset(raidPtr),
2657 rf_parity_map_size(raidPtr));
2658 if (first) {
2659 memcpy(map, &tmp, sizeof(*map));
2660 first = 0;
2661 } else {
2662 rf_paritymap_merge(map, &tmp);
2663 }
2664 }
2665 }
2666
/*
 * rf_markalldirty: bump the set's modification counter and mark every
 * usable component (and every in-use spare) dirty on disk.  Called so
 * that an unclean shutdown can be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* second pass: in-use spares get a freshly initialized label
	 * noting which column they substitute for */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			/* NOTE(review): if no column claims this spare,
			   scol keeps its previous value (initially -1) --
			   presumably a used spare always has a matching
			   spareCol; confirm. */
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2726
2727
/*
 * rf_update_component_labels: bump the modification counter and rewrite
 * the component labels of all optimal components and in-use spares.
 * When `final' is RF_FINAL_COMPONENT_UPDATE and parity is known clean,
 * the clean bit is also set (shutdown path).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	/* second pass: in-use spares get a fresh label claiming the
	 * column they reconstructed */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare replaced */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2802
2803 void
2804 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2805 {
2806
2807 if (vp != NULL) {
2808 if (auto_configured == 1) {
2809 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2810 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2811 vput(vp);
2812
2813 } else {
2814 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2815 }
2816 }
2817 }
2818
2819
2820 void
2821 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2822 {
2823 int r,c;
2824 struct vnode *vp;
2825 int acd;
2826
2827
2828 /* We take this opportunity to close the vnodes like we should.. */
2829
2830 for (c = 0; c < raidPtr->numCol; c++) {
2831 vp = raidPtr->raid_cinfo[c].ci_vp;
2832 acd = raidPtr->Disks[c].auto_configured;
2833 rf_close_component(raidPtr, vp, acd);
2834 raidPtr->raid_cinfo[c].ci_vp = NULL;
2835 raidPtr->Disks[c].auto_configured = 0;
2836 }
2837
2838 for (r = 0; r < raidPtr->numSpare; r++) {
2839 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2840 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2841 rf_close_component(raidPtr, vp, acd);
2842 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2843 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2844 }
2845 }
2846
2847
2848 void
2849 rf_ReconThread(struct rf_recon_req *req)
2850 {
2851 int s;
2852 RF_Raid_t *raidPtr;
2853
2854 s = splbio();
2855 raidPtr = (RF_Raid_t *) req->raidPtr;
2856 raidPtr->recon_in_progress = 1;
2857
2858 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2859 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2860
2861 RF_Free(req, sizeof(*req));
2862
2863 raidPtr->recon_in_progress = 0;
2864 splx(s);
2865
2866 /* That's all... */
2867 kthread_exit(0); /* does not return */
2868 }
2869
/*
 * rf_RewriteParityThread: kernel-thread body that rewrites all parity
 * for the set.  On success the set is marked clean; anyone blocked in
 * shutdown waiting on parity_rewrite_in_progress is woken.  Exits the
 * thread when done; never returns.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2900
2901
2902 void
2903 rf_CopybackThread(RF_Raid_t *raidPtr)
2904 {
2905 int s;
2906
2907 raidPtr->copyback_in_progress = 1;
2908 s = splbio();
2909 rf_CopybackReconstructedData(raidPtr);
2910 splx(s);
2911 raidPtr->copyback_in_progress = 0;
2912
2913 /* That's all... */
2914 kthread_exit(0); /* does not return */
2915 }
2916
2917
2918 void
2919 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2920 {
2921 int s;
2922 RF_Raid_t *raidPtr;
2923
2924 s = splbio();
2925 raidPtr = req->raidPtr;
2926 raidPtr->recon_in_progress = 1;
2927 rf_ReconstructInPlace(raidPtr, req->col);
2928 RF_Free(req, sizeof(*req));
2929 raidPtr->recon_in_progress = 0;
2930 splx(s);
2931
2932 /* That's all... */
2933 kthread_exit(0); /* does not return */
2934 }
2935
/*
 * rf_get_component: probe one candidate component during autoconfig.
 *
 * Reads the component label from dev/vp; if it is reasonable, prepends
 * a new RF_AutoConfig_t (which takes ownership of vp and clabel) to
 * ac_list and returns the new head.  If the label is missing or bad,
 * the vnode is closed and released and ac_list is returned unchanged.
 * On memory exhaustion the entire ac_list is freed and NULL returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* out of memory: tear down everything collected so far */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL;	/* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			    cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;	/* ac now owns the vnode */
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: not a component -- release label and vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2993
/*
 * rf_find_raid_components: scan every disk device in the system for
 * RAIDframe component labels.
 *
 * Iterates all DV_DISK devices (skipping floppies, CDs, and md), and
 * for each one checks wedges, FS_RAID disklabel partitions, and -- if
 * neither yielded a component -- the raw partition.  Each candidate is
 * handed to rf_get_component(), which accumulates the matches.
 *
 * Returns the (possibly NULL) list of discovered components.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		/* dk(4) wedges address the whole device; others use the
		   raw partition of the disk */
		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* wedge path: accept only RAIDframe-typed wedges */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component takes ownership of vp */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
			rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
3175
3176
3177 int
3178 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3179 {
3180
3181 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3182 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3183 ((clabel->clean == RF_RAID_CLEAN) ||
3184 (clabel->clean == RF_RAID_DIRTY)) &&
3185 clabel->row >=0 &&
3186 clabel->column >= 0 &&
3187 clabel->num_rows > 0 &&
3188 clabel->num_columns > 0 &&
3189 clabel->row < clabel->num_rows &&
3190 clabel->column < clabel->num_columns &&
3191 clabel->blockSize > 0 &&
3192 /*
3193 * numBlocksHi may contain garbage, but it is ok since
3194 * the type is unsigned. If it is really garbage,
3195 * rf_fix_old_label_size() will fix it.
3196 */
3197 rf_component_label_numblocks(clabel) > 0) {
3198 /*
3199 * label looks reasonable enough...
3200 * let's make sure it has no old garbage.
3201 */
3202 if (numsecs)
3203 rf_fix_old_label_size(clabel, numsecs);
3204 return(1);
3205 }
3206 return(0);
3207 }
3208
3209
3210 /*
3211 * For reasons yet unknown, some old component labels have garbage in
3212 * the newer numBlocksHi region, and this causes lossage. Since those
3213 * disks will also have numsecs set to less than 32 bits of sectors,
3214 * we can determine when this corruption has occured, and fix it.
3215 *
3216 * The exact same problem, with the same unknown reason, happens to
3217 * the partitionSizeHi member as well.
3218 */
3219 static void
3220 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3221 {
3222
3223 if (numsecs < ((uint64_t)1 << 32)) {
3224 if (clabel->numBlocksHi) {
3225 printf("WARNING: total sectors < 32 bits, yet "
3226 "numBlocksHi set\n"
3227 "WARNING: resetting numBlocksHi to zero.\n");
3228 clabel->numBlocksHi = 0;
3229 }
3230
3231 if (clabel->partitionSizeHi) {
3232 printf("WARNING: total sectors < 32 bits, yet "
3233 "partitionSizeHi set\n"
3234 "WARNING: resetting partitionSizeHi to zero.\n");
3235 clabel->partitionSizeHi = 0;
3236 }
3237 }
3238 }
3239
3240
#ifdef DEBUG
/*
 * rf_print_component_label: dump a component label to the console in
 * human-readable form (DEBUG kernels only).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;

	/* combine the lo/hi block-count fields */
	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3271
/*
 * rf_create_auto_sets: partition the discovered components into
 * configuration sets (one set per RAID array).
 *
 * Each component is removed from ac_list and spliced into the `ac'
 * chain of the first set it fits (per rf_does_it_fit()); a component
 * that fits no existing set starts a new one.  Panics on allocation
 * failure.  Returns the list of sets.
 */
RF_ConfigSet_t *
rf_create_auto_sets(RF_AutoConfig_t *ac_list)
{
	RF_AutoConfig_t *ac;
	RF_ConfigSet_t *config_sets;
	RF_ConfigSet_t *cset;
	RF_AutoConfig_t *ac_next;


	config_sets = NULL;

	/* Go through the AutoConfig list, and figure out which components
	   belong to what sets.  */
	ac = ac_list;
	while(ac!=NULL) {
		/* we're going to putz with ac->next, so save it here
		   for use at the end of the loop */
		ac_next = ac->next;

		if (config_sets == NULL) {
			/* will need at least this one... */
			config_sets = (RF_ConfigSet_t *)
				malloc(sizeof(RF_ConfigSet_t),
				       M_RAIDFRAME, M_NOWAIT);
			if (config_sets == NULL) {
				panic("rf_create_auto_sets: No memory!");
			}
			/* this one is easy :) */
			config_sets->ac = ac;
			config_sets->next = NULL;
			config_sets->rootable = 0;
			ac->next = NULL;
		} else {
			/* which set does this component fit into? */
			cset = config_sets;
			while(cset!=NULL) {
				if (rf_does_it_fit(cset, ac)) {
					/* looks like it matches...
					   prepend to this set's chain */
					ac->next = cset->ac;
					cset->ac = ac;
					break;
				}
				cset = cset->next;
			}
			if (cset==NULL) {
				/* didn't find a match above... new set..*/
				cset = (RF_ConfigSet_t *)
					malloc(sizeof(RF_ConfigSet_t),
					       M_RAIDFRAME, M_NOWAIT);
				if (cset == NULL) {
					panic("rf_create_auto_sets: No memory!");
				}
				cset->ac = ac;
				ac->next = NULL;
				cset->next = config_sets;
				cset->rootable = 0;
				config_sets = cset;
			}
		}
		ac = ac_next;
	}


	return(config_sets);
}
3337
3338 static int
3339 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3340 {
3341 RF_ComponentLabel_t *clabel1, *clabel2;
3342
3343 /* If this one matches the *first* one in the set, that's good
3344 enough, since the other members of the set would have been
3345 through here too... */
3346 /* note that we are not checking partitionSize here..
3347
3348 Note that we are also not checking the mod_counters here.
3349 If everything else matches execpt the mod_counter, that's
3350 good enough for this test. We will deal with the mod_counters
3351 a little later in the autoconfiguration process.
3352
3353 (clabel1->mod_counter == clabel2->mod_counter) &&
3354
3355 The reason we don't check for this is that failed disks
3356 will have lower modification counts. If those disks are
3357 not added to the set they used to belong to, then they will
3358 form their own set, which may result in 2 different sets,
3359 for example, competing to be configured at raid0, and
3360 perhaps competing to be the root filesystem set. If the
3361 wrong ones get configured, or both attempt to become /,
3362 weird behaviour and or serious lossage will occur. Thus we
3363 need to bring them into the fold here, and kick them out at
3364 a later point.
3365
3366 */
3367
3368 clabel1 = cset->ac->clabel;
3369 clabel2 = ac->clabel;
3370 if ((clabel1->version == clabel2->version) &&
3371 (clabel1->serial_number == clabel2->serial_number) &&
3372 (clabel1->num_rows == clabel2->num_rows) &&
3373 (clabel1->num_columns == clabel2->num_columns) &&
3374 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3375 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3376 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3377 (clabel1->parityConfig == clabel2->parityConfig) &&
3378 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3379 (clabel1->blockSize == clabel2->blockSize) &&
3380 rf_component_label_numblocks(clabel1) ==
3381 rf_component_label_numblocks(clabel2) &&
3382 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3383 (clabel1->root_partition == clabel2->root_partition) &&
3384 (clabel1->last_unit == clabel2->last_unit) &&
3385 (clabel1->config_order == clabel2->config_order)) {
3386 /* if it get's here, it almost *has* to be a match */
3387 } else {
3388 /* it's not consistent with somebody in the set..
3389 punt */
3390 return(0);
3391 }
3392 /* all was fine.. it must fit... */
3393 return(1);
3394 }
3395
/*
 * Check that config set 'cset' has enough live components to be
 * configured.  Returns 1 if the set is viable, 0 if too many
 * components are missing.  "Live" means: present in the set AND
 * carrying the highest mod_counter seen in the set (stale components
 * from failed disks have lower counters and don't count).
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set. If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the maximum over all members. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			/* first member seeds the maximum */
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* For each expected column, look for an up-to-date component. */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
			/* Didn't find one here! */
			/* special case for RAID 1, especially
			   where there are more than 2
			   components (where RAIDframe treats
			   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component. If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						/* both halves of a RAID 1
						   mirror pair are gone */
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just did the odd component of a mirror
			   pair, and we didn't bail.. reset the
			   even_pair_failed flag, and go on to the
			   next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no failures; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3498
3499 void
3500 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3501 RF_Raid_t *raidPtr)
3502 {
3503 RF_ComponentLabel_t *clabel;
3504 int i;
3505
3506 clabel = ac->clabel;
3507
3508 /* 1. Fill in the common stuff */
3509 config->numRow = clabel->num_rows = 1;
3510 config->numCol = clabel->num_columns;
3511 config->numSpare = 0; /* XXX should this be set here? */
3512 config->sectPerSU = clabel->sectPerSU;
3513 config->SUsPerPU = clabel->SUsPerPU;
3514 config->SUsPerRU = clabel->SUsPerRU;
3515 config->parityConfig = clabel->parityConfig;
3516 /* XXX... */
3517 strcpy(config->diskQueueType,"fifo");
3518 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3519 config->layoutSpecificSize = 0; /* XXX ?? */
3520
3521 while(ac!=NULL) {
3522 /* row/col values will be in range due to the checks
3523 in reasonable_label() */
3524 strcpy(config->devnames[0][ac->clabel->column],
3525 ac->devname);
3526 ac = ac->next;
3527 }
3528
3529 for(i=0;i<RF_MAXDBGV;i++) {
3530 config->debugVars[i][0] = 0;
3531 }
3532 }
3533
3534 int
3535 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3536 {
3537 RF_ComponentLabel_t *clabel;
3538 int column;
3539 int sparecol;
3540
3541 raidPtr->autoconfigure = new_value;
3542
3543 for(column=0; column<raidPtr->numCol; column++) {
3544 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3545 clabel = raidget_component_label(raidPtr, column);
3546 clabel->autoconfigure = new_value;
3547 raidflush_component_label(raidPtr, column);
3548 }
3549 }
3550 for(column = 0; column < raidPtr->numSpare ; column++) {
3551 sparecol = raidPtr->numCol + column;
3552 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3553 clabel = raidget_component_label(raidPtr, sparecol);
3554 clabel->autoconfigure = new_value;
3555 raidflush_component_label(raidPtr, sparecol);
3556 }
3557 }
3558 return(new_value);
3559 }
3560
3561 int
3562 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3563 {
3564 RF_ComponentLabel_t *clabel;
3565 int column;
3566 int sparecol;
3567
3568 raidPtr->root_partition = new_value;
3569 for(column=0; column<raidPtr->numCol; column++) {
3570 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3571 clabel = raidget_component_label(raidPtr, column);
3572 clabel->root_partition = new_value;
3573 raidflush_component_label(raidPtr, column);
3574 }
3575 }
3576 for(column = 0; column < raidPtr->numSpare ; column++) {
3577 sparecol = raidPtr->numCol + column;
3578 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3579 clabel = raidget_component_label(raidPtr, sparecol);
3580 clabel->root_partition = new_value;
3581 raidflush_component_label(raidPtr, sparecol);
3582 }
3583 }
3584 return(new_value);
3585 }
3586
3587 void
3588 rf_release_all_vps(RF_ConfigSet_t *cset)
3589 {
3590 RF_AutoConfig_t *ac;
3591
3592 ac = cset->ac;
3593 while(ac!=NULL) {
3594 /* Close the vp, and give it back */
3595 if (ac->vp) {
3596 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3597 VOP_CLOSE(ac->vp, FREAD, NOCRED);
3598 vput(ac->vp);
3599 ac->vp = NULL;
3600 }
3601 ac = ac->next;
3602 }
3603 }
3604
3605
3606 void
3607 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3608 {
3609 RF_AutoConfig_t *ac;
3610 RF_AutoConfig_t *next_ac;
3611
3612 ac = cset->ac;
3613 while(ac!=NULL) {
3614 next_ac = ac->next;
3615 /* nuke the label */
3616 free(ac->clabel, M_RAIDFRAME);
3617 /* cleanup the config structure */
3618 free(ac, M_RAIDFRAME);
3619 /* "next.." */
3620 ac = next_ac;
3621 }
3622 /* and, finally, nuke the config set */
3623 free(cset, M_RAIDFRAME);
3624 }
3625
3626
3627 void
3628 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3629 {
3630 /* current version number */
3631 clabel->version = RF_COMPONENT_LABEL_VERSION;
3632 clabel->serial_number = raidPtr->serial_number;
3633 clabel->mod_counter = raidPtr->mod_counter;
3634
3635 clabel->num_rows = 1;
3636 clabel->num_columns = raidPtr->numCol;
3637 clabel->clean = RF_RAID_DIRTY; /* not clean */
3638 clabel->status = rf_ds_optimal; /* "It's good!" */
3639
3640 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3641 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3642 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3643
3644 clabel->blockSize = raidPtr->bytesPerSector;
3645 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3646
3647 /* XXX not portable */
3648 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3649 clabel->maxOutstanding = raidPtr->maxOutstanding;
3650 clabel->autoconfigure = raidPtr->autoconfigure;
3651 clabel->root_partition = raidPtr->root_partition;
3652 clabel->last_unit = raidPtr->raidid;
3653 clabel->config_order = raidPtr->config_order;
3654
3655 #ifndef RF_NO_PARITY_MAP
3656 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3657 #endif
3658 }
3659
/*
 * Autoconfigure the raid set described by 'cset'.  On success returns
 * 0 and stores the configured unit number in *unit; on failure returns
 * non-zero and *unit is -1.
 */
int
rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	int retcode;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	retcode = 0;
	*unit = -1;

	/* 1. Create a config structure */

	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
				       M_RAIDFRAME,
				       M_NOWAIT);
	if (config==NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return(1);
	}

	memset(config, 0, sizeof(RF_Config_t));

	/*
	  2. Figure out what RAID ID this one is supposed to live at
	  See if we can get the same RAID dev that it was configured
	  on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* clamp bogus label values into [0, numraid) */
	if ((raidID < 0) || (raidID >= numraid)) {
		/* let's not wander off into lala land. */
		raidID = numraid - 1;
	}
	if (raidPtrs[raidID]->valid != 0) {

		/*
		  Nope... Go looking for an alternative...
		  Start high so we don't immediately use raid0 if that's
		  not taken.
		*/

		/* if every unit is taken, the loop exits with raidID == -1,
		   which the check below catches */
		for(raidID = numraid - 1; raidID >= 0; raidID--) {
			if (raidPtrs[raidID]->valid == 0) {
				/* can use this one! */
				break;
			}
		}
	}

	if (raidID < 0) {
		/* punt... */
		printf("Unable to auto configure this set!\n");
		printf("(Out of RAID devs!)\n");
		free(config, M_RAIDFRAME);
		return(1);
	}

#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = raidPtrs[raidID];

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	retcode = rf_Configure(raidPtr, config, cset->ac);

	if (retcode == 0) {

		raidinit(raidPtrs[raidID]);

		rf_markalldirty(raidPtrs[raidID]);
		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
		if (cset->ac->clabel->root_partition==1) {
			/* everything configured just fine. Make a note
			   that this set is eligible to be root. */
			cset->rootable = 1;
			/* XXX do this here? */
			raidPtrs[raidID]->root_partition = 1;
		}
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);

	*unit = raidID;
	return(retcode);
}
3760
3761 void
3762 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3763 {
3764 struct buf *bp;
3765
3766 bp = (struct buf *)desc->bp;
3767 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3768 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3769 }
3770
/*
 * Initialize a pool for RAIDframe use: pre-allocate xmin items and cap
 * the pool at xmax items.
 * NOTE(review): the prime/sethiwat/setlowat call order is preserved
 * as-is -- priming before setting the low watermark may be
 * intentional; confirm against pool(9) before reordering.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3780
3781 /*
3782 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3783 * if there is IO pending and if that IO could possibly be done for a
3784 * given RAID set. Returns 0 if IO is waiting and can be done, 1
3785 * otherwise.
3786 *
3787 */
3788
3789 int
3790 rf_buf_queue_check(int raidid)
3791 {
3792 if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) &&
3793 raidPtrs[raidid]->openings > 0) {
3794 /* there is work to do */
3795 return 0;
3796 }
3797 /* default is nothing to do */
3798 return 1;
3799 }
3800
3801 int
3802 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3803 {
3804 uint64_t numsecs;
3805 unsigned secsize;
3806 int error;
3807
3808 error = getdisksize(vp, &numsecs, &secsize);
3809 if (error == 0) {
3810 diskPtr->blockSize = secsize;
3811 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3812 diskPtr->partitionSize = numsecs;
3813 return 0;
3814 }
3815 return error;
3816 }
3817
/*
 * Autoconf match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3823
/*
 * Autoconf attach function: nothing to do here -- real setup happens
 * when the raid set is configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3829
3830
3831 static int
3832 raid_detach(device_t self, int flags)
3833 {
3834 int error;
3835 struct raid_softc *rs = &raid_softc[device_unit(self)];
3836
3837 if ((error = raidlock(rs)) != 0)
3838 return (error);
3839
3840 error = raid_detach_unlocked(rs);
3841
3842 raidunlock(rs);
3843
3844 return error;
3845 }
3846
/*
 * Publish the raid set's geometry as device properties: build a
 * "geometry" dictionary, hang it off a "disk-info" dictionary, and
 * attach that to both the device and the disk structure.  Any
 * previously attached disk-info dictionary is released.
 * NOTE(review): the track/cylinder figures (dataSectorsPerStripe,
 * 4 * numCol) are synthetic -- presumably chosen to give disklabel
 * something plausible; confirm before changing.
 */
static void
rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	prop_dictionary_t disk_info, odisk_info, geom;
	disk_info = prop_dictionary_create();
	geom = prop_dictionary_create();
	prop_dictionary_set_uint64(geom, "sectors-per-unit",
				   raidPtr->totalSectors);
	prop_dictionary_set_uint32(geom, "sector-size",
				   raidPtr->bytesPerSector);

	prop_dictionary_set_uint16(geom, "sectors-per-track",
				   raidPtr->Layout.dataSectorsPerStripe);
	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
				   4 * raidPtr->numCol);

	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
	    raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
	    (4 * raidPtr->numCol)));

	/* geom is retained by disk_info; drop our reference */
	prop_dictionary_set(disk_info, "geometry", geom);
	prop_object_release(geom);
	prop_dictionary_set(device_properties(rs->sc_dev),
	    "disk-info", disk_info);
	/* swap in the new dictionary and release the old one, if any */
	odisk_info = rs->sc_dkdev.dk_info;
	rs->sc_dkdev.dk_info = disk_info;
	if (odisk_info)
		prop_object_release(odisk_info);
}
3876
3877 /*
3878 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3879 * We end up returning whatever error was returned by the first cache flush
3880 * that fails.
3881 */
3882
3883 int
3884 rf_sync_component_caches(RF_Raid_t *raidPtr)
3885 {
3886 int c, sparecol;
3887 int e,error;
3888 int force = 1;
3889
3890 error = 0;
3891 for (c = 0; c < raidPtr->numCol; c++) {
3892 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3893 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3894 &force, FWRITE, NOCRED);
3895 if (e) {
3896 if (e != ENODEV)
3897 printf("raid%d: cache flush to component %s failed.\n",
3898 raidPtr->raidid, raidPtr->Disks[c].devname);
3899 if (error == 0) {
3900 error = e;
3901 }
3902 }
3903 }
3904 }
3905
3906 for( c = 0; c < raidPtr->numSpare ; c++) {
3907 sparecol = raidPtr->numCol + c;
3908 /* Need to ensure that the reconstruct actually completed! */
3909 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3910 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3911 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3912 if (e) {
3913 if (e != ENODEV)
3914 printf("raid%d: cache flush to component %s failed.\n",
3915 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3916 if (error == 0) {
3917 error = e;
3918 }
3919 }
3920 }
3921 }
3922 return error;
3923 }
3924