rf_netbsdkintf.c revision 1.396 1 /* $NetBSD: rf_netbsdkintf.c,v 1.396 2021/07/23 02:35:14 oster Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.396 2021/07/23 02:35:14 oster Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_raid_autoconfig.h"
108 #include "opt_compat_netbsd32.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/compat_stub.h>
131
132 #include <prop/proplib.h>
133
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137
138 #include "rf_raid.h"
139 #include "rf_copyback.h"
140 #include "rf_dag.h"
141 #include "rf_dagflags.h"
142 #include "rf_desc.h"
143 #include "rf_diskqueue.h"
144 #include "rf_etimer.h"
145 #include "rf_general.h"
146 #include "rf_kintf.h"
147 #include "rf_options.h"
148 #include "rf_driver.h"
149 #include "rf_parityscan.h"
150 #include "rf_threadstuff.h"
151
152 #include "ioconf.h"
153
154 #ifdef DEBUG
155 int rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else /* DEBUG */
158 #define db1_printf(a) { }
159 #endif /* DEBUG */
160
161 #ifdef DEBUG_ROOT
162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
163 #else
164 #define DPRINTF(a, ...)
165 #endif
166
167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
168 static rf_declare_mutex2(rf_sparet_wait_mutex);
169 static rf_declare_cond2(rf_sparet_wait_cv);
170 static rf_declare_cond2(rf_sparet_resp_cv);
171
172 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
173 * spare table */
174 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
175 * installation process */
176 #endif
177
178 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
179
180 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
181
182 /* prototypes */
183 static void KernelWakeupFunc(struct buf *);
184 static void InitBP(struct buf *, struct vnode *, unsigned,
185 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
186 void *, int);
187 static void raidinit(struct raid_softc *);
188 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
189 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
190
191 static int raid_match(device_t, cfdata_t, void *);
192 static void raid_attach(device_t, device_t, void *);
193 static int raid_detach(device_t, int);
194
195 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
196 daddr_t, daddr_t);
197 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
198 daddr_t, daddr_t, int);
199
200 static int raidwrite_component_label(unsigned,
201 dev_t, struct vnode *, RF_ComponentLabel_t *);
202 static int raidread_component_label(unsigned,
203 dev_t, struct vnode *, RF_ComponentLabel_t *);
204
205 static int raid_diskstart(device_t, struct buf *bp);
206 static int raid_dumpblocks(device_t, void *, daddr_t, int);
207 static int raid_lastclose(device_t);
208
209 static dev_type_open(raidopen);
210 static dev_type_close(raidclose);
211 static dev_type_read(raidread);
212 static dev_type_write(raidwrite);
213 static dev_type_ioctl(raidioctl);
214 static dev_type_strategy(raidstrategy);
215 static dev_type_dump(raiddump);
216 static dev_type_size(raidsize);
217
218 const struct bdevsw raid_bdevsw = {
219 .d_open = raidopen,
220 .d_close = raidclose,
221 .d_strategy = raidstrategy,
222 .d_ioctl = raidioctl,
223 .d_dump = raiddump,
224 .d_psize = raidsize,
225 .d_discard = nodiscard,
226 .d_flag = D_DISK
227 };
228
229 const struct cdevsw raid_cdevsw = {
230 .d_open = raidopen,
231 .d_close = raidclose,
232 .d_read = raidread,
233 .d_write = raidwrite,
234 .d_ioctl = raidioctl,
235 .d_stop = nostop,
236 .d_tty = notty,
237 .d_poll = nopoll,
238 .d_mmap = nommap,
239 .d_kqfilter = nokqfilter,
240 .d_discard = nodiscard,
241 .d_flag = D_DISK
242 };
243
244 static struct dkdriver rf_dkdriver = {
245 .d_open = raidopen,
246 .d_close = raidclose,
247 .d_strategy = raidstrategy,
248 .d_diskstart = raid_diskstart,
249 .d_dumpblocks = raid_dumpblocks,
250 .d_lastclose = raid_lastclose,
251 .d_minphys = minphys
252 };
253
254 #define raidunit(x) DISKUNIT(x)
255 #define raidsoftc(dev) (((struct raid_softc *)device_private(dev))->sc_r.softc)
256
257 extern struct cfdriver raid_cd;
258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
259 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
260 DVF_DETACH_SHUTDOWN);
261
262 /* Internal representation of a rf_recon_req */
263 struct rf_recon_req_internal {
264 RF_RowCol_t col;
265 RF_ReconReqFlags_t flags;
266 void *raidPtr;
267 };
268
269 /*
270 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
271 * Be aware that large numbers can allow the driver to consume a lot of
272 * kernel memory, especially on writes, and in degraded mode reads.
273 *
274 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
275 * a single 64K write will typically require 64K for the old data,
276 * 64K for the old parity, and 64K for the new parity, for a total
277 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
279 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
280 *
281 * Now in degraded mode, for example, a 64K read on the above setup may
282 * require data reconstruction, which will require *all* of the 4 remaining
283 * disks to participate -- 4 * 32K/disk == 128K again.
284 */
285
286 #ifndef RAIDOUTSTANDING
287 #define RAIDOUTSTANDING 6
288 #endif
289
290 #define RAIDLABELDEV(dev) \
291 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
292
293 /* declared here, and made public, for the benefit of KVM stuff.. */
294
295 static int raidlock(struct raid_softc *);
296 static void raidunlock(struct raid_softc *);
297
298 static int raid_detach_unlocked(struct raid_softc *);
299
300 static void rf_markalldirty(RF_Raid_t *);
301 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
302
303 static void rf_ReconThread(struct rf_recon_req_internal *);
304 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
305 static void rf_CopybackThread(RF_Raid_t *raidPtr);
306 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
307 static int rf_autoconfig(device_t);
308 static void rf_buildroothack(RF_ConfigSet_t *);
309
310 static RF_AutoConfig_t *rf_find_raid_components(void);
311 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
312 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
313 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
314 static int rf_set_autoconfig(RF_Raid_t *, int);
315 static int rf_set_rootpartition(RF_Raid_t *, int);
316 static void rf_release_all_vps(RF_ConfigSet_t *);
317 static void rf_cleanup_config_set(RF_ConfigSet_t *);
318 static int rf_have_enough_components(RF_ConfigSet_t *);
319 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
320 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
321
322 /*
323 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
324 * Note that this is overridden by having RAID_AUTOCONFIG as an option
325 * in the kernel config file.
326 */
327 #ifdef RAID_AUTOCONFIG
328 int raidautoconfig = 1;
329 #else
330 int raidautoconfig = 0;
331 #endif
332 static bool raidautoconfigdone = false;
333
334 struct pool rf_alloclist_pool; /* AllocList */
335
336 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
337 static kmutex_t raid_lock;
338
339 static struct raid_softc *
340 raidcreate(int unit) {
341 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
342 sc->sc_unit = unit;
343 cv_init(&sc->sc_cv, "raidunit");
344 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
345 return sc;
346 }
347
348 static void
349 raiddestroy(struct raid_softc *sc) {
350 cv_destroy(&sc->sc_cv);
351 mutex_destroy(&sc->sc_mutex);
352 kmem_free(sc, sizeof(*sc));
353 }
354
355 static struct raid_softc *
356 raidget(int unit, bool create) {
357 struct raid_softc *sc;
358 if (unit < 0) {
359 #ifdef DIAGNOSTIC
360 panic("%s: unit %d!", __func__, unit);
361 #endif
362 return NULL;
363 }
364 mutex_enter(&raid_lock);
365 LIST_FOREACH(sc, &raids, sc_link) {
366 if (sc->sc_unit == unit) {
367 mutex_exit(&raid_lock);
368 return sc;
369 }
370 }
371 mutex_exit(&raid_lock);
372 if (!create)
373 return NULL;
374 sc = raidcreate(unit);
375 mutex_enter(&raid_lock);
376 LIST_INSERT_HEAD(&raids, sc, sc_link);
377 mutex_exit(&raid_lock);
378 return sc;
379 }
380
381 static void
382 raidput(struct raid_softc *sc) {
383 mutex_enter(&raid_lock);
384 LIST_REMOVE(sc, sc_link);
385 mutex_exit(&raid_lock);
386 raiddestroy(sc);
387 }
388
/*
 * Legacy pseudo-device attach hook.  Intentionally empty: device
 * attachment and associated initialization now occurs as part of the
 * module initialization, so there is nothing left to do here.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
398
399 static int
400 rf_autoconfig(device_t self)
401 {
402 RF_AutoConfig_t *ac_list;
403 RF_ConfigSet_t *config_sets;
404
405 if (!raidautoconfig || raidautoconfigdone == true)
406 return 0;
407
408 /* XXX This code can only be run once. */
409 raidautoconfigdone = true;
410
411 #ifdef __HAVE_CPU_BOOTCONF
412 /*
413 * 0. find the boot device if needed first so we can use it later
414 * this needs to be done before we autoconfigure any raid sets,
415 * because if we use wedges we are not going to be able to open
416 * the boot device later
417 */
418 if (booted_device == NULL)
419 cpu_bootconf();
420 #endif
421 /* 1. locate all RAID components on the system */
422 aprint_debug("Searching for RAID components...\n");
423 ac_list = rf_find_raid_components();
424
425 /* 2. Sort them into their respective sets. */
426 config_sets = rf_create_auto_sets(ac_list);
427
428 /*
429 * 3. Evaluate each set and configure the valid ones.
430 * This gets done in rf_buildroothack().
431 */
432 rf_buildroothack(config_sets);
433
434 return 1;
435 }
436
437 int
438 rf_inited(const struct raid_softc *rs) {
439 return (rs->sc_flags & RAIDF_INITED) != 0;
440 }
441
/* Accessor: return the RAIDframe state embedded in the softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
446
/* Accessor: return the unit number of this RAID softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
451
452 static int
453 rf_containsboot(RF_Raid_t *r, device_t bdv) {
454 const char *bootname;
455 size_t len;
456
457 /* if bdv is NULL, the set can't contain it. exit early. */
458 if (bdv == NULL)
459 return 0;
460
461 bootname = device_xname(bdv);
462 len = strlen(bootname);
463
464 for (int col = 0; col < r->numCol; col++) {
465 const char *devname = r->Disks[col].devname;
466 devname += sizeof("/dev/") - 1;
467 if (strncmp(devname, "dk", 2) == 0) {
468 const char *parent =
469 dkwedge_get_parent_name(r->Disks[col].dev);
470 if (parent != NULL)
471 devname = parent;
472 }
473 if (strncmp(devname, bootname, len) == 0) {
474 struct raid_softc *sc = r->softc;
475 aprint_debug("raid%d includes boot device %s\n",
476 sc->sc_unit, devname);
477 return 1;
478 }
479 }
480 return 0;
481 }
482
/*
 * Evaluate the auto-configuration sets discovered at boot: configure
 * every eligible set, then decide whether one of the resulting RAID
 * units should become the root device (overriding booted_device).
 *
 * config_sets is consumed: each set is either configured or released,
 * and always cleaned up, before return.  May modify the globals
 * booted_device, booted_method, booted_partition and boothowto.
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* count of configured sets marked rootable */
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/* Pass 1: configure (or release) every discovered set. */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				/* remember the last rootable set seen */
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		/* Exactly one rootable set: pick a root device from it. */
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
			"contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			DPRINTF("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		/*
		 * Several rootable sets: disambiguate by looking for the
		 * one that actually contains the device we booted from.
		 */
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			/* only sets containing the boot device qualify */
			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
627
628 static int
629 raidsize(dev_t dev)
630 {
631 struct raid_softc *rs;
632 struct dk_softc *dksc;
633 unsigned int unit;
634
635 unit = raidunit(dev);
636 if ((rs = raidget(unit, false)) == NULL)
637 return -1;
638 dksc = &rs->sc_dksc;
639
640 if ((rs->sc_flags & RAIDF_INITED) == 0)
641 return -1;
642
643 return dk_size(dksc, dev);
644 }
645
646 static int
647 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
648 {
649 unsigned int unit;
650 struct raid_softc *rs;
651 struct dk_softc *dksc;
652
653 unit = raidunit(dev);
654 if ((rs = raidget(unit, false)) == NULL)
655 return ENXIO;
656 dksc = &rs->sc_dksc;
657
658 if ((rs->sc_flags & RAIDF_INITED) == 0)
659 return ENODEV;
660
661 /*
662 Note that blkno is relative to this particular partition.
663 By adding adding RF_PROTECTED_SECTORS, we get a value that
664 is relative to the partition used for the underlying component.
665 */
666 blkno += RF_PROTECTED_SECTORS;
667
668 return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
669 }
670
/*
 * dkdriver dumpblocks entry point: during a crash dump, write `nblk'
 * sectors starting at `blkno' directly to one live component of the
 * set.  Only RAID 1 sets (one data + one parity column) are supported.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* find which column this spare replaces */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	/* Dump through the chosen component's raw block device. */
	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
776
777 /* ARGSUSED */
778 static int
779 raidopen(dev_t dev, int flags, int fmt,
780 struct lwp *l)
781 {
782 int unit = raidunit(dev);
783 struct raid_softc *rs;
784 struct dk_softc *dksc;
785 int error = 0;
786 int part, pmask;
787
788 if ((rs = raidget(unit, true)) == NULL)
789 return ENXIO;
790 if ((error = raidlock(rs)) != 0)
791 return error;
792
793 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
794 error = EBUSY;
795 goto bad;
796 }
797
798 dksc = &rs->sc_dksc;
799
800 part = DISKPART(dev);
801 pmask = (1 << part);
802
803 if (!DK_BUSY(dksc, pmask) &&
804 ((rs->sc_flags & RAIDF_INITED) != 0)) {
805 /* First one... mark things as dirty... Note that we *MUST*
806 have done a configure before this. I DO NOT WANT TO BE
807 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
808 THAT THEY BELONG TOGETHER!!!!! */
809 /* XXX should check to see if we're only open for reading
810 here... If so, we needn't do this, but then need some
811 other way of keeping track of what's happened.. */
812
813 rf_markalldirty(&rs->sc_r);
814 }
815
816 if ((rs->sc_flags & RAIDF_INITED) != 0)
817 error = dk_open(dksc, dev, flags, fmt, l);
818
819 bad:
820 raidunlock(rs);
821
822 return error;
823
824
825 }
826
827 static int
828 raid_lastclose(device_t self)
829 {
830 struct raid_softc *rs = raidsoftc(self);
831
832 /* Last one... device is not unconfigured yet.
833 Device shutdown has taken care of setting the
834 clean bits if RAIDF_INITED is not set
835 mark things as clean... */
836
837 rf_update_component_labels(&rs->sc_r,
838 RF_FINAL_COMPONENT_UPDATE);
839
840 /* pass to unlocked code */
841 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
842 rs->sc_flags |= RAIDF_DETACH;
843
844 return 0;
845 }
846
847 /* ARGSUSED */
848 static int
849 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
850 {
851 int unit = raidunit(dev);
852 struct raid_softc *rs;
853 struct dk_softc *dksc;
854 cfdata_t cf;
855 int error = 0, do_detach = 0, do_put = 0;
856
857 if ((rs = raidget(unit, false)) == NULL)
858 return ENXIO;
859 dksc = &rs->sc_dksc;
860
861 if ((error = raidlock(rs)) != 0)
862 return error;
863
864 if ((rs->sc_flags & RAIDF_INITED) != 0) {
865 error = dk_close(dksc, dev, flags, fmt, l);
866 if ((rs->sc_flags & RAIDF_DETACH) != 0)
867 do_detach = 1;
868 } else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
869 do_put = 1;
870
871 raidunlock(rs);
872
873 if (do_detach) {
874 /* free the pseudo device attach bits */
875 cf = device_cfdata(dksc->sc_dev);
876 error = config_detach(dksc->sc_dev, 0);
877 if (error == 0)
878 free(cf, M_RAIDFRAME);
879 } else if (do_put) {
880 raidput(rs);
881 }
882
883 return error;
884
885 }
886
/*
 * Poke the RAID I/O thread: signal iodone_cv under iodone_lock so
 * that queued work gets picked up.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
894
895 static void
896 raidstrategy(struct buf *bp)
897 {
898 unsigned int unit;
899 struct raid_softc *rs;
900 struct dk_softc *dksc;
901 RF_Raid_t *raidPtr;
902
903 unit = raidunit(bp->b_dev);
904 if ((rs = raidget(unit, false)) == NULL) {
905 bp->b_error = ENXIO;
906 goto fail;
907 }
908 if ((rs->sc_flags & RAIDF_INITED) == 0) {
909 bp->b_error = ENXIO;
910 goto fail;
911 }
912 dksc = &rs->sc_dksc;
913 raidPtr = &rs->sc_r;
914
915 /* Queue IO only */
916 if (dk_strategy_defer(dksc, bp))
917 goto done;
918
919 /* schedule the IO to happen at the next convenient time */
920 raid_wakeup(raidPtr);
921
922 done:
923 return;
924
925 fail:
926 bp->b_resid = bp->b_bcount;
927 biodone(bp);
928 }
929
930 static int
931 raid_diskstart(device_t dev, struct buf *bp)
932 {
933 struct raid_softc *rs = raidsoftc(dev);
934 RF_Raid_t *raidPtr;
935
936 raidPtr = &rs->sc_r;
937 if (!raidPtr->valid) {
938 db1_printf(("raid is not valid..\n"));
939 return ENODEV;
940 }
941
942 /* XXX */
943 bp->b_resid = 0;
944
945 return raiddoaccess(raidPtr, bp);
946 }
947
948 void
949 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
950 {
951 struct raid_softc *rs;
952 struct dk_softc *dksc;
953
954 rs = raidPtr->softc;
955 dksc = &rs->sc_dksc;
956
957 dk_done(dksc, bp);
958
959 rf_lock_mutex2(raidPtr->mutex);
960 raidPtr->openings++;
961 rf_unlock_mutex2(raidPtr->mutex);
962
963 /* schedule more IO */
964 raid_wakeup(raidPtr);
965 }
966
967 /* ARGSUSED */
968 static int
969 raidread(dev_t dev, struct uio *uio, int flags)
970 {
971 int unit = raidunit(dev);
972 struct raid_softc *rs;
973
974 if ((rs = raidget(unit, false)) == NULL)
975 return ENXIO;
976
977 if ((rs->sc_flags & RAIDF_INITED) == 0)
978 return ENXIO;
979
980 return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
981
982 }
983
984 /* ARGSUSED */
985 static int
986 raidwrite(dev_t dev, struct uio *uio, int flags)
987 {
988 int unit = raidunit(dev);
989 struct raid_softc *rs;
990
991 if ((rs = raidget(unit, false)) == NULL)
992 return ENXIO;
993
994 if ((rs->sc_flags & RAIDF_INITED) == 0)
995 return ENXIO;
996
997 return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
998
999 }
1000
/*
 * Tear down a configured RAID set.  Caller must hold the unit lock
 * (raidlock).  Fails with EBUSY while any partition is open or a
 * reconstruction, parity rewrite, or copyback is in progress.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* refuse to detach while the set is open or busy reorganizing */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* nothing to tear down if the set was never configured */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/*
	 * NOTE(review): RAIDF_SHUTDOWN is cleared before rf_Shutdown();
	 * presumably so a failed shutdown leaves the unit in a normal
	 * (non-shutting-down) state — confirm against callers.
	 */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
1038
1039 static bool
1040 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
1041 {
1042 switch (cmd) {
1043 case RAIDFRAME_ADD_HOT_SPARE:
1044 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1045 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1046 case RAIDFRAME_CHECK_PARITY:
1047 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1048 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1049 case RAIDFRAME_CHECK_RECON_STATUS:
1050 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1051 case RAIDFRAME_COPYBACK:
1052 case RAIDFRAME_DELETE_COMPONENT:
1053 case RAIDFRAME_FAIL_DISK:
1054 case RAIDFRAME_GET_ACCTOTALS:
1055 case RAIDFRAME_GET_COMPONENT_LABEL:
1056 case RAIDFRAME_GET_INFO:
1057 case RAIDFRAME_GET_SIZE:
1058 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1059 case RAIDFRAME_INIT_LABELS:
1060 case RAIDFRAME_KEEP_ACCTOTALS:
1061 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1062 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1063 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1064 case RAIDFRAME_PARITYMAP_STATUS:
1065 case RAIDFRAME_REBUILD_IN_PLACE:
1066 case RAIDFRAME_REMOVE_HOT_SPARE:
1067 case RAIDFRAME_RESET_ACCTOTALS:
1068 case RAIDFRAME_REWRITEPARITY:
1069 case RAIDFRAME_SET_AUTOCONFIG:
1070 case RAIDFRAME_SET_COMPONENT_LABEL:
1071 case RAIDFRAME_SET_ROOT:
1072 return (rs->sc_flags & RAIDF_INITED) == 0;
1073 }
1074 return false;
1075 }
1076
/*
 * Mark component `rr->col' of the set as failed and start a kernel
 * thread to reconstruct onto a spare.  Returns 0 on success, EINVAL
 * for invalid or unsafe requests (RAID 0, bad column, already
 * reconstructing, other failures present, or a spared disk), ENOMEM
 * if the internal request copy cannot be allocated.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/*
	 * NOTE(review): ownership of rrint passes to rf_ReconThread —
	 * presumably the thread frees it; if thread creation fails,
	 * rrint appears to leak.  Confirm RF_CREATE_THREAD semantics.
	 */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
1125
1126 static int
1127 rf_copyinspecificbuf(RF_Config_t *k_cfg)
1128 {
1129 /* allocate a buffer for the layout-specific data, and copy it in */
1130 if (k_cfg->layoutSpecificSize == 0)
1131 return 0;
1132
1133 if (k_cfg->layoutSpecificSize > 10000) {
1134 /* sanity check */
1135 return EINVAL;
1136 }
1137
1138 u_char *specific_buf;
1139 specific_buf = RF_Malloc(k_cfg->layoutSpecificSize);
1140 if (specific_buf == NULL)
1141 return ENOMEM;
1142
1143 int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1144 k_cfg->layoutSpecificSize);
1145 if (retcode) {
1146 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1147 db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
1148 return retcode;
1149 }
1150
1151 k_cfg->layoutSpecific = specific_buf;
1152 return 0;
1153 }
1154
1155 static int
1156 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
1157 {
1158 RF_Config_t *u_cfg = *((RF_Config_t **) data);
1159
1160 if (rs->sc_r.valid) {
1161 /* There is a valid RAID set running on this unit! */
1162 printf("raid%d: Device already configured!\n", rs->sc_unit);
1163 return EINVAL;
1164 }
1165
1166 /* copy-in the configuration information */
1167 /* data points to a pointer to the configuration structure */
1168 *k_cfg = RF_Malloc(sizeof(**k_cfg));
1169 if (*k_cfg == NULL) {
1170 return ENOMEM;
1171 }
1172 int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
1173 if (retcode == 0)
1174 return 0;
1175 RF_Free(*k_cfg, sizeof(RF_Config_t));
1176 db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
1177 rs->sc_flags |= RAIDF_SHUTDOWN;
1178 return retcode;
1179 }
1180
/*
 * rf_construct: configure and bring up the RAID set described by the
 * already copied-in configuration `k_cfg'.  Consumes k_cfg: both it and
 * any layout-specific buffer are freed before returning.  On failure
 * RAIDF_SHUTDOWN is set so the device is detached when closed.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* Pull the layout-specific data into kernel space, if any. */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 * there is no stale data left in the case of a
	 * reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
1233
#if RF_DISABLED
/*
 * rf_set_component_label: handler for RAIDFRAME_SET_COMPONENT_LABEL
 * (currently compiled out).  Validates the column and copies the
 * user-supplied label over the in-core one, then flushes it to disk.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
1272
1273 static int
1274 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
1275 {
1276 /*
1277 we only want the serial number from
1278 the above. We get all the rest of the information
1279 from the config that was used to create this RAID
1280 set.
1281 */
1282
1283 raidPtr->serial_number = clabel->serial_number;
1284
1285 for (int column = 0; column < raidPtr->numCol; column++) {
1286 RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
1287 if (RF_DEAD_DISK(diskPtr->status))
1288 continue;
1289 RF_ComponentLabel_t *ci_label = raidget_component_label(
1290 raidPtr, column);
1291 /* Zeroing this is important. */
1292 memset(ci_label, 0, sizeof(*ci_label));
1293 raid_init_component_label(raidPtr, ci_label);
1294 ci_label->serial_number = raidPtr->serial_number;
1295 ci_label->row = 0; /* we dont' pretend to support more */
1296 rf_component_label_set_partitionsize(ci_label,
1297 diskPtr->partitionSize);
1298 ci_label->column = column;
1299 raidflush_component_label(raidPtr, column);
1300 /* XXXjld what about the spares? */
1301 }
1302
1303 return 0;
1304 }
1305
/*
 * rf_rebuild_in_place: handler for RAIDFRAME_REBUILD_IN_PLACE.  Rebuild
 * the named component onto itself (rather than onto a spare) in a
 * separate kernel thread.  Returns EINVAL when not permitted (RAID 0,
 * recon already running, bad column, other failures present, component
 * already reconstructing or spared), ENOMEM on allocation failure,
 * otherwise the result of RF_CREATE_THREAD.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Copy the request out of the user's buffer before using it. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* Component status checks must be made under the array mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		/* Can't rebuild a spared component in place. */
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	/* NOTE(review): rrint->flags is never set here, unlike in
	 * rf_fail_disk() -- presumably RF_Malloc zero-fills; confirm. */
	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
1373
1374 static int
1375 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
1376 {
1377 /*
1378 * This makes no sense on a RAID 0, or if we are not reconstructing
1379 * so tell the user it's done.
1380 */
1381 if (raidPtr->Layout.map->faultsTolerated == 0 ||
1382 raidPtr->status != rf_rs_reconstructing) {
1383 *data = 100;
1384 return 0;
1385 }
1386 if (raidPtr->reconControl->numRUsTotal == 0) {
1387 *data = 0;
1388 return 0;
1389 }
1390 *data = (raidPtr->reconControl->numRUsComplete * 100
1391 / raidPtr->reconControl->numRUsTotal);
1392 return 0;
1393 }
1394
1395 static int
1396 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1397 {
1398 int unit = raidunit(dev);
1399 int part, pmask;
1400 struct raid_softc *rs;
1401 struct dk_softc *dksc;
1402 RF_Config_t *k_cfg;
1403 RF_Raid_t *raidPtr;
1404 RF_AccTotals_t *totals;
1405 RF_SingleComponent_t component;
1406 RF_DeviceConfig_t *d_cfg, *ucfgp;
1407 int retcode = 0;
1408 int column;
1409 RF_ComponentLabel_t *clabel;
1410 RF_SingleComponent_t *sparePtr,*componentPtr;
1411 int d;
1412
1413 if ((rs = raidget(unit, false)) == NULL)
1414 return ENXIO;
1415
1416 dksc = &rs->sc_dksc;
1417 raidPtr = &rs->sc_r;
1418
1419 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1420 (int) DISKPART(dev), (int) unit, cmd));
1421
1422 /* Must be initialized for these... */
1423 if (rf_must_be_initialized(rs, cmd))
1424 return ENXIO;
1425
1426 switch (cmd) {
1427 /* configure the system */
1428 case RAIDFRAME_CONFIGURE:
1429 if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
1430 return retcode;
1431 return rf_construct(rs, k_cfg);
1432
1433 /* shutdown the system */
1434 case RAIDFRAME_SHUTDOWN:
1435
1436 part = DISKPART(dev);
1437 pmask = (1 << part);
1438
1439 if ((retcode = raidlock(rs)) != 0)
1440 return retcode;
1441
1442 if (DK_BUSY(dksc, pmask) ||
1443 raidPtr->recon_in_progress != 0 ||
1444 raidPtr->parity_rewrite_in_progress != 0 ||
1445 raidPtr->copyback_in_progress != 0)
1446 retcode = EBUSY;
1447 else {
1448 /* detach and free on close */
1449 rs->sc_flags |= RAIDF_SHUTDOWN;
1450 retcode = 0;
1451 }
1452
1453 raidunlock(rs);
1454
1455 return retcode;
1456 case RAIDFRAME_GET_COMPONENT_LABEL:
1457 return rf_get_component_label(raidPtr, data);
1458
1459 #if RF_DISABLED
1460 case RAIDFRAME_SET_COMPONENT_LABEL:
1461 return rf_set_component_label(raidPtr, data);
1462 #endif
1463
1464 case RAIDFRAME_INIT_LABELS:
1465 return rf_init_component_label(raidPtr, data);
1466
1467 case RAIDFRAME_SET_AUTOCONFIG:
1468 d = rf_set_autoconfig(raidPtr, *(int *) data);
1469 printf("raid%d: New autoconfig value is: %d\n",
1470 raidPtr->raidid, d);
1471 *(int *) data = d;
1472 return retcode;
1473
1474 case RAIDFRAME_SET_ROOT:
1475 d = rf_set_rootpartition(raidPtr, *(int *) data);
1476 printf("raid%d: New rootpartition value is: %d\n",
1477 raidPtr->raidid, d);
1478 *(int *) data = d;
1479 return retcode;
1480
1481 /* initialize all parity */
1482 case RAIDFRAME_REWRITEPARITY:
1483
1484 if (raidPtr->Layout.map->faultsTolerated == 0) {
1485 /* Parity for RAID 0 is trivially correct */
1486 raidPtr->parity_good = RF_RAID_CLEAN;
1487 return 0;
1488 }
1489
1490 if (raidPtr->parity_rewrite_in_progress == 1) {
1491 /* Re-write is already in progress! */
1492 return EINVAL;
1493 }
1494
1495 return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1496 rf_RewriteParityThread, raidPtr,"raid_parity");
1497
1498 case RAIDFRAME_ADD_HOT_SPARE:
1499 sparePtr = (RF_SingleComponent_t *) data;
1500 memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
1501 return rf_add_hot_spare(raidPtr, &component);
1502
1503 case RAIDFRAME_REMOVE_HOT_SPARE:
1504 return retcode;
1505
1506 case RAIDFRAME_DELETE_COMPONENT:
1507 componentPtr = (RF_SingleComponent_t *)data;
1508 memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1509 return rf_delete_component(raidPtr, &component);
1510
1511 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1512 componentPtr = (RF_SingleComponent_t *)data;
1513 memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1514 return rf_incorporate_hot_spare(raidPtr, &component);
1515
1516 case RAIDFRAME_REBUILD_IN_PLACE:
1517 return rf_rebuild_in_place(raidPtr, data);
1518
1519 case RAIDFRAME_GET_INFO:
1520 ucfgp = *(RF_DeviceConfig_t **)data;
1521 d_cfg = RF_Malloc(sizeof(*d_cfg));
1522 if (d_cfg == NULL)
1523 return ENOMEM;
1524 retcode = rf_get_info(raidPtr, d_cfg);
1525 if (retcode == 0) {
1526 retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
1527 }
1528 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1529 return retcode;
1530
1531 case RAIDFRAME_CHECK_PARITY:
1532 *(int *) data = raidPtr->parity_good;
1533 return 0;
1534
1535 case RAIDFRAME_PARITYMAP_STATUS:
1536 if (rf_paritymap_ineligible(raidPtr))
1537 return EINVAL;
1538 rf_paritymap_status(raidPtr->parity_map, data);
1539 return 0;
1540
1541 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1542 if (rf_paritymap_ineligible(raidPtr))
1543 return EINVAL;
1544 if (raidPtr->parity_map == NULL)
1545 return ENOENT; /* ??? */
1546 if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
1547 return EINVAL;
1548 return 0;
1549
1550 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1551 if (rf_paritymap_ineligible(raidPtr))
1552 return EINVAL;
1553 *(int *) data = rf_paritymap_get_disable(raidPtr);
1554 return 0;
1555
1556 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1557 if (rf_paritymap_ineligible(raidPtr))
1558 return EINVAL;
1559 rf_paritymap_set_disable(raidPtr, *(int *)data);
1560 /* XXX should errors be passed up? */
1561 return 0;
1562
1563 case RAIDFRAME_RESET_ACCTOTALS:
1564 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1565 return 0;
1566
1567 case RAIDFRAME_GET_ACCTOTALS:
1568 totals = (RF_AccTotals_t *) data;
1569 *totals = raidPtr->acc_totals;
1570 return 0;
1571
1572 case RAIDFRAME_KEEP_ACCTOTALS:
1573 raidPtr->keep_acc_totals = *(int *)data;
1574 return 0;
1575
1576 case RAIDFRAME_GET_SIZE:
1577 *(int *) data = raidPtr->totalSectors;
1578 return 0;
1579
1580 case RAIDFRAME_FAIL_DISK:
1581 return rf_fail_disk(raidPtr, data);
1582
1583 /* invoke a copyback operation after recon on whatever disk
1584 * needs it, if any */
1585 case RAIDFRAME_COPYBACK:
1586
1587 if (raidPtr->Layout.map->faultsTolerated == 0) {
1588 /* This makes no sense on a RAID 0!! */
1589 return EINVAL;
1590 }
1591
1592 if (raidPtr->copyback_in_progress == 1) {
1593 /* Copyback is already in progress! */
1594 return EINVAL;
1595 }
1596
1597 return RF_CREATE_THREAD(raidPtr->copyback_thread,
1598 rf_CopybackThread, raidPtr, "raid_copyback");
1599
1600 /* return the percentage completion of reconstruction */
1601 case RAIDFRAME_CHECK_RECON_STATUS:
1602 return rf_check_recon_status(raidPtr, data);
1603
1604 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1605 rf_check_recon_status_ext(raidPtr, data);
1606 return 0;
1607
1608 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1609 if (raidPtr->Layout.map->faultsTolerated == 0) {
1610 /* This makes no sense on a RAID 0, so tell the
1611 user it's done. */
1612 *(int *) data = 100;
1613 return 0;
1614 }
1615 if (raidPtr->parity_rewrite_in_progress == 1) {
1616 *(int *) data = 100 *
1617 raidPtr->parity_rewrite_stripes_done /
1618 raidPtr->Layout.numStripe;
1619 } else {
1620 *(int *) data = 100;
1621 }
1622 return 0;
1623
1624 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1625 rf_check_parityrewrite_status_ext(raidPtr, data);
1626 return 0;
1627
1628 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1629 if (raidPtr->Layout.map->faultsTolerated == 0) {
1630 /* This makes no sense on a RAID 0 */
1631 *(int *) data = 100;
1632 return 0;
1633 }
1634 if (raidPtr->copyback_in_progress == 1) {
1635 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1636 raidPtr->Layout.numStripe;
1637 } else {
1638 *(int *) data = 100;
1639 }
1640 return 0;
1641
1642 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1643 rf_check_copyback_status_ext(raidPtr, data);
1644 return 0;
1645
1646 case RAIDFRAME_SET_LAST_UNIT:
1647 for (column = 0; column < raidPtr->numCol; column++)
1648 if (raidPtr->Disks[column].status != rf_ds_optimal)
1649 return EBUSY;
1650
1651 for (column = 0; column < raidPtr->numCol; column++) {
1652 clabel = raidget_component_label(raidPtr, column);
1653 clabel->last_unit = *(int *)data;
1654 raidflush_component_label(raidPtr, column);
1655 }
1656 rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1657 return 0;
1658
1659 /* the sparetable daemon calls this to wait for the kernel to
1660 * need a spare table. this ioctl does not return until a
1661 * spare table is needed. XXX -- calling mpsleep here in the
1662 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1663 * -- I should either compute the spare table in the kernel,
1664 * or have a different -- XXX XXX -- interface (a different
1665 * character device) for delivering the table -- XXX */
1666 #if RF_DISABLED
1667 case RAIDFRAME_SPARET_WAIT:
1668 rf_lock_mutex2(rf_sparet_wait_mutex);
1669 while (!rf_sparet_wait_queue)
1670 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1671 RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
1672 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1673 rf_unlock_mutex2(rf_sparet_wait_mutex);
1674
1675 /* structure assignment */
1676 *((RF_SparetWait_t *) data) = *waitreq;
1677
1678 RF_Free(waitreq, sizeof(*waitreq));
1679 return 0;
1680
1681 /* wakes up a process waiting on SPARET_WAIT and puts an error
1682 * code in it that will cause the dameon to exit */
1683 case RAIDFRAME_ABORT_SPARET_WAIT:
1684 waitreq = RF_Malloc(sizeof(*waitreq));
1685 waitreq->fcol = -1;
1686 rf_lock_mutex2(rf_sparet_wait_mutex);
1687 waitreq->next = rf_sparet_wait_queue;
1688 rf_sparet_wait_queue = waitreq;
1689 rf_broadcast_cond2(rf_sparet_wait_cv);
1690 rf_unlock_mutex2(rf_sparet_wait_mutex);
1691 return 0;
1692
1693 /* used by the spare table daemon to deliver a spare table
1694 * into the kernel */
1695 case RAIDFRAME_SEND_SPARET:
1696
1697 /* install the spare table */
1698 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1699
1700 /* respond to the requestor. the return status of the spare
1701 * table installation is passed in the "fcol" field */
1702 waitred = RF_Malloc(sizeof(*waitreq));
1703 waitreq->fcol = retcode;
1704 rf_lock_mutex2(rf_sparet_wait_mutex);
1705 waitreq->next = rf_sparet_resp_queue;
1706 rf_sparet_resp_queue = waitreq;
1707 rf_broadcast_cond2(rf_sparet_resp_cv);
1708 rf_unlock_mutex2(rf_sparet_wait_mutex);
1709
1710 return retcode;
1711 #endif
1712 default:
1713 /*
1714 * Don't bother trying to load compat modules
1715 * if it is not our ioctl. This is more efficient
1716 * and makes rump tests not depend on compat code
1717 */
1718 if (IOCGROUP(cmd) != 'r')
1719 break;
1720 #ifdef _LP64
1721 if ((l->l_proc->p_flag & PK_32) != 0) {
1722 module_autoload("compat_netbsd32_raid",
1723 MODULE_CLASS_EXEC);
1724 MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
1725 (rs, cmd, data), enosys(), retcode);
1726 if (retcode != EPASSTHROUGH)
1727 return retcode;
1728 }
1729 #endif
1730 module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
1731 MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
1732 (rs, cmd, data), enosys(), retcode);
1733 if (retcode != EPASSTHROUGH)
1734 return retcode;
1735
1736 module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
1737 MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
1738 (rs, cmd, data), enosys(), retcode);
1739 if (retcode != EPASSTHROUGH)
1740 return retcode;
1741 break; /* fall through to the os-specific code below */
1742
1743 }
1744
1745 if (!raidPtr->valid)
1746 return EINVAL;
1747
1748 /*
1749 * Add support for "regular" device ioctls here.
1750 */
1751
1752 switch (cmd) {
1753 case DIOCGCACHE:
1754 retcode = rf_get_component_caches(raidPtr, (int *)data);
1755 break;
1756
1757 case DIOCCACHESYNC:
1758 retcode = rf_sync_component_caches(raidPtr, *(int *)data);
1759 break;
1760
1761 default:
1762 retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1763 break;
1764 }
1765
1766 return retcode;
1767
1768 }
1769
1770
1771 /* raidinit -- complete the rest of the initialization for the
1772 RAIDframe device. */
1773
1774
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: release cfdata and leave the unit
		 * without RAIDF_INITED set. */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Discover wedges on this disk now that it is fully set up. */
	dkwedge_discover(&dksc->sc_dkdev);
}
1830
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	/* Queue the request and wake the daemon waiting in
	 * RAIDFRAME_SPARET_WAIT. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Dequeue the daemon's response; its status is in fcol. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
#endif
1865
1866 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1867 * bp & passes it down.
1868 * any calls originating in the kernel must use non-blocking I/O
1869 * do some extra sanity checking to return "appropriate" error values for
1870 * certain conditions (to make some standard utilities work)
1871 *
1872 * Formerly known as: rf_DoAccessKernel
1873 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Label updates must happen without the mutex held;
		 * drop it, update, then retake to adjust the count. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
		    RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Kick the dk layer to start issuing queued buffers. */
	dk_start(dksc, NULL);
}
1900
/*
 * raiddoaccess: translate a buf into an RF_DoAccess() call.
 *
 * Returns EAGAIN when no openings are available (caller retries later),
 * ENOSPC when the request falls outside the array or is not
 * sector-aligned, otherwise the result of rf_DoAccess().
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int rc;

	/* Throttle: bail out when all openings are in use. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	/* End address plus one partial block; used for the bounds and
	 * overflow checks below. */
	sum = raid_addr + num_blocks + pb;
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Reject requests past the end of the array or that wrap. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening for this access. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
1967
1968 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
1969
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete immediately via the normal iodone path. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf; KernelWakeupFunc runs at iodone. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
2043 /* this is the callback function associated with a I/O invoked from
2044 kernel code.
2045 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* req was stashed in b_private by InitBP()/dispatch. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d).  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2112
2113
2114 /*
2115 * initialize a buf structure for doing an I/O in the kernel.
2116 */
2117 static void
2118 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2119 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2120 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
2121 {
2122 bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
2123 bp->b_oflags = 0;
2124 bp->b_cflags = 0;
2125 bp->b_bcount = numSect << logBytesPerSector;
2126 bp->b_bufsize = bp->b_bcount;
2127 bp->b_error = 0;
2128 bp->b_dev = dev;
2129 bp->b_data = bf;
2130 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2131 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2132 if (bp->b_bcount == 0) {
2133 panic("bp->b_bcount is zero in InitBP!!");
2134 }
2135 bp->b_iodone = cbFunc;
2136 bp->b_private = cbArg;
2137 }
2138
2139 /*
2140 * Wait interruptibly for an exclusive lock.
2141 *
2142 * XXX
2143 * Several drivers do this; it should be abstracted and made MP-safe.
2144 * (Hmm... where have we seen this warning before :-> GO )
2145 */
static int
raidlock(struct raid_softc *rs)
{
	int error;

	error = 0;
	mutex_enter(&rs->sc_mutex);
	/* Sleep until the holder drops RAIDF_LOCKED; cv_wait_sig lets
	 * a signal interrupt the wait, returning its error. */
	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
		rs->sc_flags |= RAIDF_WANTED;
		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
		if (error != 0)
			goto done;
	}
	rs->sc_flags |= RAIDF_LOCKED;
done:
	mutex_exit(&rs->sc_mutex);
	return error;
}
2164 /*
2165 * Unlock and wake up any waiters.
2166 */
2167 static void
2168 raidunlock(struct raid_softc *rs)
2169 {
2170
2171 mutex_enter(&rs->sc_mutex);
2172 rs->sc_flags &= ~RAIDF_LOCKED;
2173 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2174 rs->sc_flags &= ~RAIDF_WANTED;
2175 cv_broadcast(&rs->sc_cv);
2176 }
2177 mutex_exit(&rs->sc_mutex);
2178 }
2179
2180
2181 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2182 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2183 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2184
2185 static daddr_t
2186 rf_component_info_offset(void)
2187 {
2188
2189 return RF_COMPONENT_INFO_OFFSET;
2190 }
2191
2192 static daddr_t
2193 rf_component_info_size(unsigned secsize)
2194 {
2195 daddr_t info_size;
2196
2197 KASSERT(secsize);
2198 if (secsize > RF_COMPONENT_INFO_SIZE)
2199 info_size = secsize;
2200 else
2201 info_size = RF_COMPONENT_INFO_SIZE;
2202
2203 return info_size;
2204 }
2205
2206 static daddr_t
2207 rf_parity_map_offset(RF_Raid_t *raidPtr)
2208 {
2209 daddr_t map_offset;
2210
2211 KASSERT(raidPtr->bytesPerSector);
2212 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2213 map_offset = raidPtr->bytesPerSector;
2214 else
2215 map_offset = RF_COMPONENT_INFO_SIZE;
2216 map_offset += rf_component_info_offset();
2217
2218 return map_offset;
2219 }
2220
2221 static daddr_t
2222 rf_parity_map_size(RF_Raid_t *raidPtr)
2223 {
2224 daddr_t map_size;
2225
2226 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2227 map_size = raidPtr->bytesPerSector;
2228 else
2229 map_size = RF_PARITY_MAP_SIZE;
2230
2231 return map_size;
2232 }
2233
2234 int
2235 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2236 {
2237 RF_ComponentLabel_t *clabel;
2238
2239 clabel = raidget_component_label(raidPtr, col);
2240 clabel->clean = RF_RAID_CLEAN;
2241 raidflush_component_label(raidPtr, col);
2242 return(0);
2243 }
2244
2245
2246 int
2247 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2248 {
2249 RF_ComponentLabel_t *clabel;
2250
2251 clabel = raidget_component_label(raidPtr, col);
2252 clabel->clean = RF_RAID_DIRTY;
2253 raidflush_component_label(raidPtr, col);
2254 return(0);
2255 }
2256
/*
 * Re-read the component label for column `col' from disk into the
 * in-core copy kept in raidPtr->raid_cinfo[col].ci_label.
 * Returns the error from the underlying read.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);

	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2267
/*
 * Return a pointer to the in-core component label for column `col'.
 * Callers may modify it and then write it back with
 * raidflush_component_label().
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2273
/*
 * Write the in-core component label for column `col' out to disk,
 * stamping it with the set's current mod_counter first.
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* Keep the parity map's mod count in sync with the label's. */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
2288
2289 /*
2290 * Swap the label endianness.
2291 *
2292 * Everything in the component label is 4-byte-swapped except the version,
2293 * which is kept in the byte-swapped version at all times, and indicates
2294 * for the writer that a swap is necessary.
2295 *
2296 * For reads it is expected that out_label == clabel, but writes expect
2297 * separate labels so only the re-swapped label is written out to disk,
2298 * leaving the swapped-except-version internally.
2299 *
2300 * Only support swapping label version 2.
2301 */
2302 static void
2303 rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
2304 {
2305 int *in, *out, *in_last;
2306
2307 KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));
2308
2309 /* Don't swap the label, but do copy it. */
2310 out_label->version = clabel->version;
2311
2312 in = &clabel->serial_number;
2313 in_last = &clabel->future_use2[42];
2314 out = &out_label->serial_number;
2315
2316 for (; in < in_last; in++, out++)
2317 *out = bswap32(*in);
2318 }
2319
/*
 * Read the component label from the component-info area of the given
 * device into *clabel.  If the on-disk label was written with the
 * opposite endianness (detected via the byte-swapped version field),
 * swap it in place so the in-core copy is host-endian-usable.
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	int error;

	error = raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));

	if (error == 0 &&
	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
		rf_swap_label(clabel, clabel);
	}

	return error;
}
2338
/* ARGSUSED */
/*
 * Read `msize' bytes into `data' from a reserved area of the component:
 * `offset' is the byte offset of the area, `dsize' its on-disk size
 * (the I/O is done at area size, then only msize bytes are copied out).
 * Returns 0 on success or an errno value.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
	bp->b_resid = dsize;

	/* Submit the I/O and wait for it to complete. */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	/* Release the temporary buffer in all cases. */
	brelse(bp, 0);
	return(error);
}
2376
/*
 * Write *clabel to the component-info area of the given device.  If the
 * in-core label is byte-swapped (other-endian version field), write out
 * a re-swapped copy instead, leaving the in-core label untouched.
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	RF_ComponentLabel_t *clabel_write = clabel;
	RF_ComponentLabel_t lclabel;	/* scratch copy for the swap case */
	int error;

	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
		clabel_write = &lclabel;
		rf_swap_label(clabel, clabel_write);
	}
	error = raidwrite_component_area(dev, b_vp, clabel_write,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);

	return error;
}
2396
/* ARGSUSED */
/*
 * Write `msize' bytes from `data' to a reserved area of the component
 * at byte offset `offset'; the full `dsize' area is zero-padded.  If
 * `asyncp' is nonzero the write is issued B_ASYNC and this returns 0
 * immediately without waiting (note: the buffer is then not released
 * here — presumably the async completion path handles it; confirm).
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
	bp->b_resid = dsize;

	/* Zero-fill the whole area, then copy in the payload. */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
2431
/*
 * Write the on-disk parity map to every live component of the set.
 * Dead disks are skipped; write errors are currently ignored (see XXX).
 */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
2449
/*
 * Read the parity map from every live component and merge the copies
 * into *map: the first component read seeds the map, each subsequent
 * one is folded in with rf_paritymap_merge().  Read errors from
 * raidread_component_area() are ignored here.
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}
2474
/*
 * Mark the component labels of all live components (including in-use
 * spares) dirty.  Failed disks are left completely untouched.  The
 * set's mod_counter is bumped first so the new labels supersede any
 * older on-disk copies.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2534
2535
/*
 * Refresh the component labels of all optimal components and in-use
 * spares: bump the mod counter, record status and the configured unit,
 * and flush the labels to disk.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known clean, the labels are
 * also marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2613
2614 void
2615 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2616 {
2617
2618 if (vp != NULL) {
2619 if (auto_configured == 1) {
2620 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2621 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2622 vput(vp);
2623
2624 } else {
2625 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2626 }
2627 }
2628 }
2629
2630
2631 void
2632 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2633 {
2634 int r,c;
2635 struct vnode *vp;
2636 int acd;
2637
2638
2639 /* We take this opportunity to close the vnodes like we should.. */
2640
2641 for (c = 0; c < raidPtr->numCol; c++) {
2642 vp = raidPtr->raid_cinfo[c].ci_vp;
2643 acd = raidPtr->Disks[c].auto_configured;
2644 rf_close_component(raidPtr, vp, acd);
2645 raidPtr->raid_cinfo[c].ci_vp = NULL;
2646 raidPtr->Disks[c].auto_configured = 0;
2647 }
2648
2649 for (r = 0; r < raidPtr->numSpare; r++) {
2650 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2651 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2652 rf_close_component(raidPtr, vp, acd);
2653 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2654 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2655 }
2656 }
2657
2658
/*
 * Kernel thread body: fail the requested component (optionally kicking
 * off reconstruction, per RF_FDFLAGS_RECON) and exit.  The request
 * structure is owned by this thread and freed here.
 */
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Second arg: 1 => also start reconstruction to a spare. */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2680
/*
 * Kernel thread body: rewrite all parity for the set.  On success the
 * set's parity is flagged clean (the component labels get the clean
 * bit at shutdown).  Wakes anyone blocked in shutdown waiting for the
 * rewrite to finish, then exits.
 */
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2713
2714
/*
 * Kernel thread body: copy reconstructed data from the spare back to
 * a replaced component, then exit.
 */
static void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2729
2730
/*
 * Kernel thread body: reconstruct a component in place (onto the same
 * column), free the request, and exit.
 */
static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2748
/*
 * Try to read a component label from the device and, if it looks
 * reasonable, prepend a new RF_AutoConfig_t entry to ac_list (taking
 * ownership of vp and the label).  If no valid label is found, the
 * label memory is freed and the vnode is closed and released here.
 * Returns the (possibly extended) list head.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			    cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: release the label and close/drop the vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2790
/*
 * Scan all disk devices in the system for RAIDframe components and
 * return a list of RF_AutoConfig_t entries for those found.  Wedges
 * are scanned in a first pass, everything else in a second pass, so
 * that wedges take precedence over the raw partitions that contain
 * them.  For non-wedge disks, each FS_RAID partition is checked; if
 * none is found, the raw partition is checked as a last resort.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		    dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* Only wedges typed as RAIDframe qualify. */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes over the vnode. */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
3002
3003 int
3004 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3005 {
3006
3007 if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
3008 clabel->version==RF_COMPONENT_LABEL_VERSION ||
3009 clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
3010 (clabel->clean == RF_RAID_CLEAN ||
3011 clabel->clean == RF_RAID_DIRTY) &&
3012 clabel->row >=0 &&
3013 clabel->column >= 0 &&
3014 clabel->num_rows > 0 &&
3015 clabel->num_columns > 0 &&
3016 clabel->row < clabel->num_rows &&
3017 clabel->column < clabel->num_columns &&
3018 clabel->blockSize > 0 &&
3019 /*
3020 * numBlocksHi may contain garbage, but it is ok since
3021 * the type is unsigned. If it is really garbage,
3022 * rf_fix_old_label_size() will fix it.
3023 */
3024 rf_component_label_numblocks(clabel) > 0) {
3025 /*
3026 * label looks reasonable enough...
3027 * let's make sure it has no old garbage.
3028 */
3029 if (numsecs)
3030 rf_fix_old_label_size(clabel, numsecs);
3031 return(1);
3032 }
3033 return(0);
3034 }
3035
3036
3037 /*
3038 * For reasons yet unknown, some old component labels have garbage in
3039 * the newer numBlocksHi region, and this causes lossage. Since those
3040 * disks will also have numsecs set to less than 32 bits of sectors,
3041 * we can determine when this corruption has occurred, and fix it.
3042 *
3043 * The exact same problem, with the same unknown reason, happens to
3044 * the partitionSizeHi member as well.
3045 */
3046 static void
3047 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3048 {
3049
3050 if (numsecs < ((uint64_t)1 << 32)) {
3051 if (clabel->numBlocksHi) {
3052 printf("WARNING: total sectors < 32 bits, yet "
3053 "numBlocksHi set\n"
3054 "WARNING: resetting numBlocksHi to zero.\n");
3055 clabel->numBlocksHi = 0;
3056 }
3057
3058 if (clabel->partitionSizeHi) {
3059 printf("WARNING: total sectors < 32 bits, yet "
3060 "partitionSizeHi set\n"
3061 "WARNING: resetting partitionSizeHi to zero.\n");
3062 clabel->partitionSizeHi = 0;
3063 }
3064 }
3065 }
3066
3067
#ifdef DEBUG
/*
 * Dump the interesting fields of a component label to the console.
 * Debug builds only.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition is masked to 2 bits below; index 3 is invalid. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3101
3102 static RF_ConfigSet_t *
3103 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3104 {
3105 RF_AutoConfig_t *ac;
3106 RF_ConfigSet_t *config_sets;
3107 RF_ConfigSet_t *cset;
3108 RF_AutoConfig_t *ac_next;
3109
3110
3111 config_sets = NULL;
3112
3113 /* Go through the AutoConfig list, and figure out which components
3114 belong to what sets. */
3115 ac = ac_list;
3116 while(ac!=NULL) {
3117 /* we're going to putz with ac->next, so save it here
3118 for use at the end of the loop */
3119 ac_next = ac->next;
3120
3121 if (config_sets == NULL) {
3122 /* will need at least this one... */
3123 config_sets = malloc(sizeof(RF_ConfigSet_t),
3124 M_RAIDFRAME, M_WAITOK);
3125 /* this one is easy :) */
3126 config_sets->ac = ac;
3127 config_sets->next = NULL;
3128 config_sets->rootable = 0;
3129 ac->next = NULL;
3130 } else {
3131 /* which set does this component fit into? */
3132 cset = config_sets;
3133 while(cset!=NULL) {
3134 if (rf_does_it_fit(cset, ac)) {
3135 /* looks like it matches... */
3136 ac->next = cset->ac;
3137 cset->ac = ac;
3138 break;
3139 }
3140 cset = cset->next;
3141 }
3142 if (cset==NULL) {
3143 /* didn't find a match above... new set..*/
3144 cset = malloc(sizeof(RF_ConfigSet_t),
3145 M_RAIDFRAME, M_WAITOK);
3146 cset->ac = ac;
3147 ac->next = NULL;
3148 cset->next = config_sets;
3149 cset->rootable = 0;
3150 config_sets = cset;
3151 }
3152 }
3153 ac = ac_next;
3154 }
3155
3156
3157 return(config_sets);
3158 }
3159
3160 static int
3161 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3162 {
3163 RF_ComponentLabel_t *clabel1, *clabel2;
3164
3165 /* If this one matches the *first* one in the set, that's good
3166 enough, since the other members of the set would have been
3167 through here too... */
3168 /* note that we are not checking partitionSize here..
3169
3170 Note that we are also not checking the mod_counters here.
3171 If everything else matches except the mod_counter, that's
3172 good enough for this test. We will deal with the mod_counters
3173 a little later in the autoconfiguration process.
3174
3175 (clabel1->mod_counter == clabel2->mod_counter) &&
3176
3177 The reason we don't check for this is that failed disks
3178 will have lower modification counts. If those disks are
3179 not added to the set they used to belong to, then they will
3180 form their own set, which may result in 2 different sets,
3181 for example, competing to be configured at raid0, and
3182 perhaps competing to be the root filesystem set. If the
3183 wrong ones get configured, or both attempt to become /,
3184 weird behaviour and or serious lossage will occur. Thus we
3185 need to bring them into the fold here, and kick them out at
3186 a later point.
3187
3188 */
3189
3190 clabel1 = cset->ac->clabel;
3191 clabel2 = ac->clabel;
3192 if ((clabel1->version == clabel2->version) &&
3193 (clabel1->serial_number == clabel2->serial_number) &&
3194 (clabel1->num_rows == clabel2->num_rows) &&
3195 (clabel1->num_columns == clabel2->num_columns) &&
3196 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3197 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3198 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3199 (clabel1->parityConfig == clabel2->parityConfig) &&
3200 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3201 (clabel1->blockSize == clabel2->blockSize) &&
3202 rf_component_label_numblocks(clabel1) ==
3203 rf_component_label_numblocks(clabel2) &&
3204 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3205 (clabel1->root_partition == clabel2->root_partition) &&
3206 (clabel1->last_unit == clabel2->last_unit) &&
3207 (clabel1->config_order == clabel2->config_order)) {
3208 /* if it get's here, it almost *has* to be a match */
3209 } else {
3210 /* it's not consistent with somebody in the set..
3211 punt */
3212 return(0);
3213 }
3214 /* all was fine.. it must fit... */
3215 return(1);
3216 }
3217
/*
 * Decide whether a configuration set has enough live components to be
 * configured.  The reference mod_counter is the maximum found among
 * the set's members; only components carrying that counter count as
 * present.  RAID 1 gets special even/odd pair accounting: losing both
 * members of a mirror pair is fatal, a single loss is not.  Returns
 * 1 if configurable, 0 if too many components are missing.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* Search the set for a current component in column c. */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* Per-level tolerance: RAID 0 survives no loss, 4/5 survive one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3320
3321 static void
3322 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3323 RF_Raid_t *raidPtr)
3324 {
3325 RF_ComponentLabel_t *clabel;
3326 int i;
3327
3328 clabel = ac->clabel;
3329
3330 /* 1. Fill in the common stuff */
3331 config->numCol = clabel->num_columns;
3332 config->numSpare = 0; /* XXX should this be set here? */
3333 config->sectPerSU = clabel->sectPerSU;
3334 config->SUsPerPU = clabel->SUsPerPU;
3335 config->SUsPerRU = clabel->SUsPerRU;
3336 config->parityConfig = clabel->parityConfig;
3337 /* XXX... */
3338 strcpy(config->diskQueueType,"fifo");
3339 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3340 config->layoutSpecificSize = 0; /* XXX ?? */
3341
3342 while(ac!=NULL) {
3343 /* row/col values will be in range due to the checks
3344 in reasonable_label() */
3345 strcpy(config->devnames[0][ac->clabel->column],
3346 ac->devname);
3347 ac = ac->next;
3348 }
3349
3350 for(i=0;i<RF_MAXDBGV;i++) {
3351 config->debugVars[i][0] = 0;
3352 }
3353 }
3354
3355 static int
3356 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3357 {
3358 RF_ComponentLabel_t *clabel;
3359 int column;
3360 int sparecol;
3361
3362 raidPtr->autoconfigure = new_value;
3363
3364 for(column=0; column<raidPtr->numCol; column++) {
3365 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3366 clabel = raidget_component_label(raidPtr, column);
3367 clabel->autoconfigure = new_value;
3368 raidflush_component_label(raidPtr, column);
3369 }
3370 }
3371 for(column = 0; column < raidPtr->numSpare ; column++) {
3372 sparecol = raidPtr->numCol + column;
3373 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3374 clabel = raidget_component_label(raidPtr, sparecol);
3375 clabel->autoconfigure = new_value;
3376 raidflush_component_label(raidPtr, sparecol);
3377 }
3378 }
3379 return(new_value);
3380 }
3381
3382 static int
3383 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3384 {
3385 RF_ComponentLabel_t *clabel;
3386 int column;
3387 int sparecol;
3388
3389 raidPtr->root_partition = new_value;
3390 for(column=0; column<raidPtr->numCol; column++) {
3391 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3392 clabel = raidget_component_label(raidPtr, column);
3393 clabel->root_partition = new_value;
3394 raidflush_component_label(raidPtr, column);
3395 }
3396 }
3397 for(column = 0; column < raidPtr->numSpare ; column++) {
3398 sparecol = raidPtr->numCol + column;
3399 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3400 clabel = raidget_component_label(raidPtr, sparecol);
3401 clabel->root_partition = new_value;
3402 raidflush_component_label(raidPtr, sparecol);
3403 }
3404 }
3405 return(new_value);
3406 }
3407
3408 static void
3409 rf_release_all_vps(RF_ConfigSet_t *cset)
3410 {
3411 RF_AutoConfig_t *ac;
3412
3413 ac = cset->ac;
3414 while(ac!=NULL) {
3415 /* Close the vp, and give it back */
3416 if (ac->vp) {
3417 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3418 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3419 vput(ac->vp);
3420 ac->vp = NULL;
3421 }
3422 ac = ac->next;
3423 }
3424 }
3425
3426
3427 static void
3428 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3429 {
3430 RF_AutoConfig_t *ac;
3431 RF_AutoConfig_t *next_ac;
3432
3433 ac = cset->ac;
3434 while(ac!=NULL) {
3435 next_ac = ac->next;
3436 /* nuke the label */
3437 free(ac->clabel, M_RAIDFRAME);
3438 /* cleanup the config structure */
3439 free(ac, M_RAIDFRAME);
3440 /* "next.." */
3441 ac = next_ac;
3442 }
3443 /* and, finally, nuke the config set */
3444 free(cset, M_RAIDFRAME);
3445 }
3446
3447
/*
 * Fill in a component label from the current state of the RAID set,
 * ready to be written out to a component.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* always a single row */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* layout geometry */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;	/* remember our unit number */
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3481
/*
 * Configure one autoconfigured RAID set: pick a unit number, build an
 * RF_Config_t from the component labels, and run the normal
 * configuration path.  Returns the softc on success, NULL if
 * rf_Configure() fails.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Start at the unit recorded in the label and walk upward until
	   we find a unit that is not already a valid, configured set. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* No existing softc for this unit: allocate one now. */
	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed: release the softc we grabbed. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3553
/*
 * Initialize a per-set memory pool with a name derived from the RAID
 * unit number.  w_chan receives the generated name and must be at
 * least RF_MAX_POOLNAMELEN bytes.
 * NOTE(review): w_chan is handed to pool_init() as the pool's name —
 * presumably it must remain valid for the pool's lifetime; confirm.
 * xmax sets the pool's high watermark; xmin pre-allocates that many
 * items up front.
 */
void
rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
    size_t xmin, size_t xmax)
{

	/* Format: raid%d_foo */
	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
3566
3567
3568 /*
3569 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3570 * to see if there is IO pending and if that IO could possibly be done
3571 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3572 * otherwise.
3573 *
3574 */
3575 int
3576 rf_buf_queue_check(RF_Raid_t *raidPtr)
3577 {
3578 struct raid_softc *rs;
3579 struct dk_softc *dksc;
3580
3581 rs = raidPtr->softc;
3582 dksc = &rs->sc_dksc;
3583
3584 if ((rs->sc_flags & RAIDF_INITED) == 0)
3585 return 1;
3586
3587 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3588 /* there is work to do */
3589 return 0;
3590 }
3591 /* default is nothing to do */
3592 return 1;
3593 }
3594
3595 int
3596 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3597 {
3598 uint64_t numsecs;
3599 unsigned secsize;
3600 int error;
3601
3602 error = getdisksize(vp, &numsecs, &secsize);
3603 if (error == 0) {
3604 diskPtr->blockSize = secsize;
3605 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3606 diskPtr->partitionSize = numsecs;
3607 return 0;
3608 }
3609 return error;
3610 }
3611
/* Autoconf match function: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3617
/* Autoconf attach function: nothing to do; real setup happens at
   configuration time (see raidinit()/rf_auto_config_set()). */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3622
3623
3624 static int
3625 raid_detach(device_t self, int flags)
3626 {
3627 int error;
3628 struct raid_softc *rs = raidsoftc(self);
3629
3630 if (rs == NULL)
3631 return ENXIO;
3632
3633 if ((error = raidlock(rs)) != 0)
3634 return error;
3635
3636 error = raid_detach_unlocked(rs);
3637
3638 raidunlock(rs);
3639
3640 /* XXX raid can be referenced here */
3641
3642 if (error)
3643 return error;
3644
3645 /* Free the softc */
3646 raidput(rs);
3647
3648 return 0;
3649 }
3650
3651 static void
3652 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3653 {
3654 struct dk_softc *dksc = &rs->sc_dksc;
3655 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3656
3657 memset(dg, 0, sizeof(*dg));
3658
3659 dg->dg_secperunit = raidPtr->totalSectors;
3660 dg->dg_secsize = raidPtr->bytesPerSector;
3661 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3662 dg->dg_ntracks = 4 * raidPtr->numCol;
3663
3664 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3665 }
3666
3667 /*
3668 * Get cache info for all the components (including spares).
3669 * Returns intersection of all the cache flags of all disks, or first
3670 * error if any encountered.
3671 * XXXfua feature flags can change as spares are added - lock down somehow
3672 */
3673 static int
3674 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3675 {
3676 int c;
3677 int error;
3678 int dkwhole = 0, dkpart;
3679
3680 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3681 /*
3682 * Check any non-dead disk, even when currently being
3683 * reconstructed.
3684 */
3685 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
3686 || raidPtr->Disks[c].status == rf_ds_reconstructing) {
3687 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3688 DIOCGCACHE, &dkpart, FREAD, NOCRED);
3689 if (error) {
3690 if (error != ENODEV) {
3691 printf("raid%d: get cache for component %s failed\n",
3692 raidPtr->raidid,
3693 raidPtr->Disks[c].devname);
3694 }
3695
3696 return error;
3697 }
3698
3699 if (c == 0)
3700 dkwhole = dkpart;
3701 else
3702 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3703 }
3704 }
3705
3706 *data = dkwhole;
3707
3708 return 0;
3709 }
3710
3711 /*
3712 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3713 * We end up returning whatever error was returned by the first cache flush
3714 * that fails.
3715 */
3716
3717 static int
3718 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
3719 {
3720 int e = 0;
3721 for (int i = 0; i < 5; i++) {
3722 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3723 &force, FWRITE, NOCRED);
3724 if (!e || e == ENODEV)
3725 return e;
3726 printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
3727 raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
3728 }
3729 return e;
3730 }
3731
3732 int
3733 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
3734 {
3735 int c, error;
3736
3737 error = 0;
3738 for (c = 0; c < raidPtr->numCol; c++) {
3739 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3740 int e = rf_sync_component_cache(raidPtr, c, force);
3741 if (e && !error)
3742 error = e;
3743 }
3744 }
3745
3746 for (c = 0; c < raidPtr->numSpare ; c++) {
3747 int sparecol = raidPtr->numCol + c;
3748 /* Need to ensure that the reconstruct actually completed! */
3749 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3750 int e = rf_sync_component_cache(raidPtr, sparecol,
3751 force);
3752 if (e && !error)
3753 error = e;
3754 }
3755 }
3756 return error;
3757 }
3758
3759 /* Fill in info with the current status */
3760 void
3761 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3762 {
3763
3764 if (raidPtr->status != rf_rs_reconstructing) {
3765 info->total = 100;
3766 info->completed = 100;
3767 } else {
3768 info->total = raidPtr->reconControl->numRUsTotal;
3769 info->completed = raidPtr->reconControl->numRUsComplete;
3770 }
3771 info->remaining = info->total - info->completed;
3772 }
3773
3774 /* Fill in info with the current status */
3775 void
3776 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3777 {
3778
3779 if (raidPtr->parity_rewrite_in_progress == 1) {
3780 info->total = raidPtr->Layout.numStripe;
3781 info->completed = raidPtr->parity_rewrite_stripes_done;
3782 } else {
3783 info->completed = 100;
3784 info->total = 100;
3785 }
3786 info->remaining = info->total - info->completed;
3787 }
3788
3789 /* Fill in info with the current status */
3790 void
3791 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3792 {
3793
3794 if (raidPtr->copyback_in_progress == 1) {
3795 info->total = raidPtr->Layout.numStripe;
3796 info->completed = raidPtr->copyback_stripes_done;
3797 info->remaining = info->total - info->completed;
3798 } else {
3799 info->remaining = 0;
3800 info->completed = 100;
3801 info->total = 100;
3802 }
3803 }
3804
3805 /* Fill in config with the current info */
3806 int
3807 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3808 {
3809 int d, i, j;
3810
3811 if (!raidPtr->valid)
3812 return ENODEV;
3813 config->cols = raidPtr->numCol;
3814 config->ndevs = raidPtr->numCol;
3815 if (config->ndevs >= RF_MAX_DISKS)
3816 return ENOMEM;
3817 config->nspares = raidPtr->numSpare;
3818 if (config->nspares >= RF_MAX_DISKS)
3819 return ENOMEM;
3820 config->maxqdepth = raidPtr->maxQueueDepth;
3821 d = 0;
3822 for (j = 0; j < config->cols; j++) {
3823 config->devs[d] = raidPtr->Disks[j];
3824 d++;
3825 }
3826 for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
3827 config->spares[i] = raidPtr->Disks[j];
3828 if (config->spares[i].status == rf_ds_rebuilding_spare) {
3829 /* XXX: raidctl(8) expects to see this as a used spare */
3830 config->spares[i].status = rf_ds_used_spare;
3831 }
3832 }
3833 return 0;
3834 }
3835
3836 int
3837 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
3838 {
3839 RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
3840 RF_ComponentLabel_t *raid_clabel;
3841 int column = clabel->column;
3842
3843 if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
3844 return EINVAL;
3845 raid_clabel = raidget_component_label(raidPtr, column);
3846 memcpy(clabel, raid_clabel, sizeof *clabel);
3847 /* Fix-up for userland. */
3848 if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
3849 clabel->version = RF_COMPONENT_LABEL_VERSION;
3850
3851 return 0;
3852 }
3853
3854 /*
3855 * Module interface
3856 */
3857
3858 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
3859
3860 #ifdef _MODULE
3861 CFDRIVER_DECL(raid, DV_DISK, NULL);
3862 #endif
3863
3864 static int raid_modcmd(modcmd_t, void *);
3865 static int raid_modcmd_init(void);
3866 static int raid_modcmd_fini(void);
3867
3868 static int
3869 raid_modcmd(modcmd_t cmd, void *data)
3870 {
3871 int error;
3872
3873 error = 0;
3874 switch (cmd) {
3875 case MODULE_CMD_INIT:
3876 error = raid_modcmd_init();
3877 break;
3878 case MODULE_CMD_FINI:
3879 error = raid_modcmd_fini();
3880 break;
3881 default:
3882 error = ENOTTY;
3883 break;
3884 }
3885 return error;
3886 }
3887
/*
 * Module initialization: set up global locks, attach the device
 * switch and autoconf glue, boot the RAIDframe engine, and register
 * the autoconfiguration finalizer.  Each step rolls back the previous
 * ones on failure.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to allocate the majors dynamically;
	   EEXIST means the devsw is already present (built-in case)
	   and is not treated as a failure. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* roll back the devsw attach */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* roll back the cfdriver and devsw attaches */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* NOTE(review): error is always 0 here (reassigned by the last
	   attach above), so this check looks redundant — confirm. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* non-fatal: the module still works without autoconfig */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
3958
/*
 * Module teardown: refuse to unload while any raid device exists,
 * then detach the autoconf glue and devsw in reverse order of
 * attachment, re-attaching on partial failure so the module is left
 * in a consistent state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* roll back: re-attach the cfattach we just removed */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* roll back: re-attach cfdriver and cfattach */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe engine and release global state. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
4008