/* zpool_vdev.c — NetBSD revision 1.5 (source-viewer navigation header removed) */
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
     24  * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
     25  * Copyright 2016 Igor Kozhukhov <ikozhukhov (at) gmail.com>.
     26  */
     27 
     28 /*
     29  * Functions to convert between a list of vdevs and an nvlist representing the
     30  * configuration.  Each entry in the list can be one of:
     31  *
     32  * 	Device vdevs
     33  * 		disk=(path=..., devid=...)
     34  * 		file=(path=...)
     35  *
     36  * 	Group vdevs
     37  * 		raidz[1|2]=(...)
     38  * 		mirror=(...)
     39  *
     40  * 	Hot spares
     41  *
     42  * While the underlying implementation supports it, group vdevs cannot contain
     43  * other group vdevs.  All userland verification of devices is contained within
     44  * this file.  If successful, the nvlist returned can be passed directly to the
     45  * kernel; we've done as much verification as possible in userland.
     46  *
     47  * Hot spares are a special case, and passed down as an array of disk vdevs, at
     48  * the same level as the root of the vdev tree.
     49  *
     50  * The only function exported by this file is 'make_root_vdev'.  The
     51  * function performs several passes:
     52  *
     53  * 	1. Construct the vdev specification.  Performs syntax validation and
     54  *         makes sure each device is valid.
     55  * 	2. Check for devices in use.  Using libdiskmgt, makes sure that no
     56  *         devices are also in use.  Some can be overridden using the 'force'
     57  *         flag, others cannot.
     58  * 	3. Check for replication errors if the 'force' flag is not specified.
     59  *         validates that the replication level is consistent across the
     60  *         entire pool.
     61  * 	4. Call libzfs to label any whole disks with an EFI label.
     62  */
     63 
     64 #include <assert.h>
     65 #include <devid.h>
     66 #include <errno.h>
     67 #include <fcntl.h>
     68 #include <libintl.h>
     69 #include <libnvpair.h>
     70 #include <limits.h>
     71 #include <stdio.h>
     72 #include <string.h>
     73 #include <unistd.h>
     74 #include <paths.h>
     75 #include <sys/stat.h>
     76 #include <sys/disk.h>
     77 #include <sys/mntent.h>
     78 #ifdef __FreeBSD__
     79 #include <libgeom.h>
     80 #endif
     81 #ifdef __NetBSD__
     82 #include <sys/disklabel.h>
     83 #include <sys/ioctl.h>
     84 #endif
     85 
     86 #include "zpool_util.h"
     87 
     88 #define	BACKUP_SLICE	"s2"
     89 
     90 /*
     91  * For any given vdev specification, we can have multiple errors.  The
     92  * vdev_error() function keeps track of whether we have seen an error yet, and
     93  * prints out a header if its the first error we've seen.
     94  */
     95 boolean_t error_seen;
     96 boolean_t is_force;
     97 
     98 /*PRINTFLIKE1*/
     99 static void
    100 vdev_error(const char *fmt, ...)
    101 {
    102 	va_list ap;
    103 
    104 	if (!error_seen) {
    105 		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
    106 		if (!is_force)
    107 			(void) fprintf(stderr, gettext("use '-f' to override "
    108 			    "the following errors:\n"));
    109 		else
    110 			(void) fprintf(stderr, gettext("the following errors "
    111 			    "must be manually repaired:\n"));
    112 		error_seen = B_TRUE;
    113 	}
    114 
    115 	va_start(ap, fmt);
    116 	(void) vfprintf(stderr, fmt, ap);
    117 	va_end(ap);
    118 }
    119 
    120 #ifdef illumos
static void
libdiskmgt_error(int error)
{
	/*
	 * ENXIO/ENODEV merely mean the device doesn't live in /dev/dsk,
	 * which is a legitimate situation; stay silent for those.
	 */
	switch (error) {
	case ENXIO:
	case ENODEV:
		return;
	default:
		(void) fprintf(stderr, gettext("warning: device in use checking "
		    "failed: %s\n"), strerror(error));
		break;
	}
}
    134 
    135 /*
    136  * Validate a device, passing the bulk of the work off to libdiskmgt.
    137  */
    138 static int
    139 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare)
    140 {
    141 	char *msg;
    142 	int error = 0;
    143 	dm_who_type_t who;
    144 
    145 	if (force)
    146 		who = DM_WHO_ZPOOL_FORCE;
    147 	else if (isspare)
    148 		who = DM_WHO_ZPOOL_SPARE;
    149 	else
    150 		who = DM_WHO_ZPOOL;
    151 
    152 	if (dm_inuse((char *)path, &msg, who, &error) || error) {
    153 		if (error != 0) {
    154 			libdiskmgt_error(error);
    155 			return (0);
    156 		} else {
    157 			vdev_error("%s", msg);
    158 			free(msg);
    159 			return (-1);
    160 		}
    161 	}
    162 
    163 	/*
    164 	 * If we're given a whole disk, ignore overlapping slices since we're
    165 	 * about to label it anyway.
    166 	 */
    167 	error = 0;
    168 	if (!wholedisk && !force &&
    169 	    (dm_isoverlapping((char *)path, &msg, &error) || error)) {
    170 		if (error == 0) {
    171 			/* dm_isoverlapping returned -1 */
    172 			vdev_error(gettext("%s overlaps with %s\n"), path, msg);
    173 			free(msg);
    174 			return (-1);
    175 		} else if (error != ENODEV) {
    176 			/* libdiskmgt's devcache only handles physical drives */
    177 			libdiskmgt_error(error);
    178 			return (0);
    179 		}
    180 	}
    181 
    182 	return (0);
    183 }
    184 
    185 
    186 /*
    187  * Validate a whole disk.  Iterate over all slices on the disk and make sure
    188  * that none is in use by calling check_slice().
    189  */
    190 static int
    191 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare)
    192 {
    193 	dm_descriptor_t *drive, *media, *slice;
    194 	int err = 0;
    195 	int i;
    196 	int ret;
    197 
    198 	/*
    199 	 * Get the drive associated with this disk.  This should never fail,
    200 	 * because we already have an alias handle open for the device.
    201 	 */
    202 	if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
    203 	    &err)) == NULL || *drive == NULL) {
    204 		if (err)
    205 			libdiskmgt_error(err);
    206 		return (0);
    207 	}
    208 
    209 	if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
    210 	    &err)) == NULL) {
    211 		dm_free_descriptors(drive);
    212 		if (err)
    213 			libdiskmgt_error(err);
    214 		return (0);
    215 	}
    216 
    217 	dm_free_descriptors(drive);
    218 
    219 	/*
    220 	 * It is possible that the user has specified a removable media drive,
    221 	 * and the media is not present.
    222 	 */
    223 	if (*media == NULL) {
    224 		dm_free_descriptors(media);
    225 		vdev_error(gettext("'%s' has no media in drive\n"), name);
    226 		return (-1);
    227 	}
    228 
    229 	if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
    230 	    &err)) == NULL) {
    231 		dm_free_descriptors(media);
    232 		if (err)
    233 			libdiskmgt_error(err);
    234 		return (0);
    235 	}
    236 
    237 	dm_free_descriptors(media);
    238 
    239 	ret = 0;
    240 
    241 	/*
    242 	 * Iterate over all slices and report any errors.  We don't care about
    243 	 * overlapping slices because we are using the whole disk.
    244 	 */
    245 	for (i = 0; slice[i] != NULL; i++) {
    246 		char *name = dm_get_name(slice[i], &err);
    247 
    248 		if (check_slice(name, force, B_TRUE, isspare) != 0)
    249 			ret = -1;
    250 
    251 		dm_free_name(name);
    252 	}
    253 
    254 	dm_free_descriptors(slice);
    255 	return (ret);
    256 }
    257 
    258 /*
    259  * Validate a device.
    260  */
    261 static int
    262 check_device(const char *path, boolean_t force, boolean_t isspare)
    263 {
    264 	dm_descriptor_t desc;
    265 	int err;
    266 	char *dev;
    267 
    268 	/*
    269 	 * For whole disks, libdiskmgt does not include the leading dev path.
    270 	 */
    271 	dev = strrchr(path, '/');
    272 	assert(dev != NULL);
    273 	dev++;
    274 	if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) {
    275 		err = check_disk(path, desc, force, isspare);
    276 		dm_free_descriptor(desc);
    277 		return (err);
    278 	}
    279 
    280 	return (check_slice(path, force, B_FALSE, isspare));
    281 }
    282 #endif	/* illumos */
    283 
    284 /*
    285  * Check that a file is valid.  All we can do in this case is check that it's
    286  * not in use by another pool, and not in use by swap.
    287  */
    288 static int
    289 check_file(const char *file, boolean_t force, boolean_t isspare)
    290 {
    291 	char  *name;
    292 	int fd;
    293 	int ret = 0;
    294 	int err;
    295 	pool_state_t state;
    296 	boolean_t inuse;
    297 
    298 #ifdef illumos
    299 	if (dm_inuse_swap(file, &err)) {
    300 		if (err)
    301 			libdiskmgt_error(err);
    302 		else
    303 			vdev_error(gettext("%s is currently used by swap. "
    304 			    "Please see swap(1M).\n"), file);
    305 		return (-1);
    306 	}
    307 #endif
    308 
    309 	if ((fd = open(file, O_RDONLY)) < 0)
    310 		return (0);
    311 
    312 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
    313 		const char *desc;
    314 
    315 		switch (state) {
    316 		case POOL_STATE_ACTIVE:
    317 			desc = gettext("active");
    318 			break;
    319 
    320 		case POOL_STATE_EXPORTED:
    321 			desc = gettext("exported");
    322 			break;
    323 
    324 		case POOL_STATE_POTENTIALLY_ACTIVE:
    325 			desc = gettext("potentially active");
    326 			break;
    327 
    328 		default:
    329 			desc = gettext("unknown");
    330 			break;
    331 		}
    332 
    333 		/*
    334 		 * Allow hot spares to be shared between pools.
    335 		 */
    336 		if (state == POOL_STATE_SPARE && isspare)
    337 			return (0);
    338 
    339 		if (state == POOL_STATE_ACTIVE ||
    340 		    state == POOL_STATE_SPARE || !force) {
    341 			switch (state) {
    342 			case POOL_STATE_SPARE:
    343 				vdev_error(gettext("%s is reserved as a hot "
    344 				    "spare for pool %s\n"), file, name);
    345 				break;
    346 			default:
    347 				vdev_error(gettext("%s is part of %s pool "
    348 				    "'%s'\n"), file, desc, name);
    349 				break;
    350 			}
    351 			ret = -1;
    352 		}
    353 
    354 		free(name);
    355 	}
    356 
    357 	(void) close(fd);
    358 	return (ret);
    359 }
    360 
    361 static int
    362 check_device(const char *name, boolean_t force, boolean_t isspare)
    363 {
    364 	char path[MAXPATHLEN];
    365 
    366 	if (strncmp(name, _PATH_DEV, sizeof(_PATH_DEV) - 1) != 0)
    367 		snprintf(path, sizeof(path), "%s%s", _PATH_DEV, name);
    368 	else
    369 		strlcpy(path, name, sizeof(path));
    370 
    371 	return (check_file(path, force, isspare));
    372 }
    373 
    374 /*
    375  * By "whole disk" we mean an entire physical disk (something we can
    376  * label, toggle the write cache on, etc.) as opposed to the full
    377  * capacity of a pseudo-device such as lofi or did.  We act as if we
    378  * are labeling the disk, which should be a pretty good test of whether
    379  * it's a viable device or not.  Returns B_TRUE if it is and B_FALSE if
    380  * it isn't.
    381  */
    382 static boolean_t
    383 is_whole_disk(const char *arg)
    384 {
    385 #ifdef illumos
    386 	struct dk_gpt *label;
    387 	int	fd;
    388 	char	path[MAXPATHLEN];
    389 
    390 	(void) snprintf(path, sizeof (path), "%s%s%s",
    391 	    ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE);
    392 	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0)
    393 		return (B_FALSE);
    394 	if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
    395 		(void) close(fd);
    396 		return (B_FALSE);
    397 	}
    398 	efi_free(label);
    399 	(void) close(fd);
    400 	return (B_TRUE);
    401 #endif
    402 #ifdef __FreeBSD__
    403 	int fd;
    404 
    405 	fd = g_open(arg, 0);
    406 	if (fd >= 0) {
    407 		g_close(fd);
    408 		return (B_TRUE);
    409 	}
    410 	return (B_FALSE);
    411 #endif
    412 #ifdef __NetBSD__
    413 	struct disklabel dl;
    414 	int fd, rv;
    415 
    416 	if ((fd = open(arg, O_RDWR | O_NONBLOCK)) < 0)
    417 		return (B_FALSE);
    418 
    419 	rv = ioctl(fd, DIOCGDINFO, &dl);
    420 	close(fd);
    421 	return (rv == 0);
    422 #endif
    423 }
    424 
    425 /*
    426  * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
    427  * device, fill in the device id to make a complete nvlist.  Valid forms for a
    428  * leaf vdev are:
    429  *
    430  * 	/dev/dsk/xxx	Complete disk path
    431  * 	/xxx		Full path to file
    432  * 	xxx		Shorthand for /dev/dsk/xxx
    433  */
    434 static nvlist_t *
    435 make_leaf_vdev(const char *arg, uint64_t is_log)
    436 {
    437 	char path[MAXPATHLEN];
    438 	struct stat64 statbuf;
    439 	nvlist_t *vdev = NULL;
    440 	char *type = NULL;
    441 	boolean_t wholedisk = B_FALSE;
    442 
    443 	/*
    444 	 * Determine what type of vdev this is, and put the full path into
    445 	 * 'path'.  We detect whether this is a device of file afterwards by
    446 	 * checking the st_mode of the file.
    447 	 */
    448 	if (arg[0] == '/') {
    449 		/*
    450 		 * Complete device or file path.  Exact type is determined by
    451 		 * examining the file descriptor afterwards.
    452 		 */
    453 		wholedisk = is_whole_disk(arg);
    454 		if (!wholedisk && (stat64(arg, &statbuf) != 0)) {
    455 			(void) fprintf(stderr,
    456 			    gettext("cannot open '%s': %s\n"),
    457 			    arg, strerror(errno));
    458 			return (NULL);
    459 		}
    460 
    461 		(void) strlcpy(path, arg, sizeof (path));
    462 	} else {
    463 		/*
    464 		 * This may be a short path for a device, or it could be total
    465 		 * gibberish.  Check to see if it's a known device in
    466 		 * /dev/dsk/.  As part of this check, see if we've been given a
    467 		 * an entire disk (minus the slice number).
    468 		 */
    469 		if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
    470 			strlcpy(path, arg, sizeof (path));
    471 		else
    472 			snprintf(path, sizeof (path), "%s%s", _PATH_DEV, arg);
    473 		wholedisk = is_whole_disk(path);
    474 		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
    475 			/*
    476 			 * If we got ENOENT, then the user gave us
    477 			 * gibberish, so try to direct them with a
    478 			 * reasonable error message.  Otherwise,
    479 			 * regurgitate strerror() since it's the best we
    480 			 * can do.
    481 			 */
    482 			if (errno == ENOENT) {
    483 				(void) fprintf(stderr,
    484 				    gettext("cannot open '%s': no such "
    485 				    "GEOM provider\n"), arg);
    486 				(void) fprintf(stderr,
    487 				    gettext("must be a full path or "
    488 				    "shorthand device name\n"));
    489 				return (NULL);
    490 			} else {
    491 				(void) fprintf(stderr,
    492 				    gettext("cannot open '%s': %s\n"),
    493 				    path, strerror(errno));
    494 				return (NULL);
    495 			}
    496 		}
    497 	}
    498 
    499 #ifdef __FreeBSD__
    500 	if (S_ISCHR(statbuf.st_mode)) {
    501 		statbuf.st_mode &= ~S_IFCHR;
    502 		statbuf.st_mode |= S_IFBLK;
    503 		wholedisk = B_FALSE;
    504 	}
    505 #endif
    506 
    507 	/*
    508 	 * Determine whether this is a device or a file.
    509 	 */
    510 	if (wholedisk || S_ISBLK(statbuf.st_mode)) {
    511 		type = VDEV_TYPE_DISK;
    512 	} else if (S_ISREG(statbuf.st_mode)) {
    513 		type = VDEV_TYPE_FILE;
    514 	} else {
    515 		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
    516 		    "GEOM provider or regular file\n"), path);
    517 		return (NULL);
    518 	}
    519 
    520 	/*
    521 	 * Finally, we have the complete device or file, and we know that it is
    522 	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
    523 	 * vdevs have a 'path' element, and devices also have a 'devid' element.
    524 	 */
    525 	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
    526 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
    527 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
    528 	verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
    529 	if (strcmp(type, VDEV_TYPE_DISK) == 0)
    530 		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
    531 		    (uint64_t)wholedisk) == 0);
    532 
    533 #ifdef have_devid
    534 	/*
    535 	 * For a whole disk, defer getting its devid until after labeling it.
    536 	 */
    537 	if (S_ISBLK(statbuf.st_mode) && !wholedisk) {
    538 		/*
    539 		 * Get the devid for the device.
    540 		 */
    541 		int fd;
    542 		ddi_devid_t devid;
    543 		char *minor = NULL, *devid_str = NULL;
    544 
    545 		if ((fd = open(path, O_RDONLY)) < 0) {
    546 			(void) fprintf(stderr, gettext("cannot open '%s': "
    547 			    "%s\n"), path, strerror(errno));
    548 			nvlist_free(vdev);
    549 			return (NULL);
    550 		}
    551 
    552 		if (devid_get(fd, &devid) == 0) {
    553 			if (devid_get_minor_name(fd, &minor) == 0 &&
    554 			    (devid_str = devid_str_encode(devid, minor)) !=
    555 			    NULL) {
    556 				verify(nvlist_add_string(vdev,
    557 				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
    558 			}
    559 			if (devid_str != NULL)
    560 				devid_str_free(devid_str);
    561 			if (minor != NULL)
    562 				devid_str_free(minor);
    563 			devid_free(devid);
    564 		}
    565 
    566 		(void) close(fd);
    567 	}
    568 #endif
    569 
    570 	return (vdev);
    571 }
    572 
    573 /*
    574  * Go through and verify the replication level of the pool is consistent.
    575  * Performs the following checks:
    576  *
    577  * 	For the new spec, verifies that devices in mirrors and raidz are the
    578  * 	same size.
    579  *
    580  * 	If the current configuration already has inconsistent replication
    581  * 	levels, ignore any other potential problems in the new spec.
    582  *
    583  * 	Otherwise, make sure that the current spec (if there is one) and the new
    584  * 	spec have consistent replication levels.
    585  */
    586 typedef struct replication_level {
    587 	char *zprl_type;
    588 	uint64_t zprl_children;
    589 	uint64_t zprl_parity;
    590 } replication_level_t;
    591 
    592 #define	ZPOOL_FUZZ	(16 * 1024 * 1024)
    593 
    594 /*
    595  * Given a list of toplevel vdevs, return the current replication level.  If
    596  * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
    597  * an error message will be displayed for each self-inconsistent vdev.
    598  */
    599 static replication_level_t *
    600 get_replication(nvlist_t *nvroot, boolean_t fatal)
    601 {
    602 	nvlist_t **top;
    603 	uint_t t, toplevels;
    604 	nvlist_t **child;
    605 	uint_t c, children;
    606 	nvlist_t *nv;
    607 	char *type;
    608 	replication_level_t lastrep = {0};
    609 	replication_level_t rep;
    610 	replication_level_t *ret;
    611 	boolean_t dontreport;
    612 
    613 	ret = safe_malloc(sizeof (replication_level_t));
    614 
    615 	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
    616 	    &top, &toplevels) == 0);
    617 
    618 	for (t = 0; t < toplevels; t++) {
    619 		uint64_t is_log = B_FALSE;
    620 
    621 		nv = top[t];
    622 
    623 		/*
    624 		 * For separate logs we ignore the top level vdev replication
    625 		 * constraints.
    626 		 */
    627 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
    628 		if (is_log)
    629 			continue;
    630 
    631 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
    632 		    &type) == 0);
    633 		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
    634 		    &child, &children) != 0) {
    635 			/*
    636 			 * This is a 'file' or 'disk' vdev.
    637 			 */
    638 			rep.zprl_type = type;
    639 			rep.zprl_children = 1;
    640 			rep.zprl_parity = 0;
    641 		} else {
    642 			uint64_t vdev_size;
    643 
    644 			/*
    645 			 * This is a mirror or RAID-Z vdev.  Go through and make
    646 			 * sure the contents are all the same (files vs. disks),
    647 			 * keeping track of the number of elements in the
    648 			 * process.
    649 			 *
    650 			 * We also check that the size of each vdev (if it can
    651 			 * be determined) is the same.
    652 			 */
    653 			rep.zprl_type = type;
    654 			rep.zprl_children = 0;
    655 
    656 			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
    657 				verify(nvlist_lookup_uint64(nv,
    658 				    ZPOOL_CONFIG_NPARITY,
    659 				    &rep.zprl_parity) == 0);
    660 				assert(rep.zprl_parity != 0);
    661 			} else {
    662 				rep.zprl_parity = 0;
    663 			}
    664 
    665 			/*
    666 			 * The 'dontreport' variable indicates that we've
    667 			 * already reported an error for this spec, so don't
    668 			 * bother doing it again.
    669 			 */
    670 			type = NULL;
    671 			dontreport = 0;
    672 			vdev_size = -1ULL;
    673 			for (c = 0; c < children; c++) {
    674 				boolean_t is_replacing, is_spare;
    675 				nvlist_t *cnv = child[c];
    676 				char *path;
    677 				struct stat64 statbuf;
    678 				uint64_t size = -1ULL;
    679 				char *childtype;
    680 				int fd, err;
    681 
    682 				rep.zprl_children++;
    683 
    684 				verify(nvlist_lookup_string(cnv,
    685 				    ZPOOL_CONFIG_TYPE, &childtype) == 0);
    686 
    687 				/*
    688 				 * If this is a replacing or spare vdev, then
    689 				 * get the real first child of the vdev.
    690 				 */
    691 				is_replacing = strcmp(childtype,
    692 				    VDEV_TYPE_REPLACING) == 0;
    693 				is_spare = strcmp(childtype,
    694 				    VDEV_TYPE_SPARE) == 0;
    695 				if (is_replacing || is_spare) {
    696 					nvlist_t **rchild;
    697 					uint_t rchildren;
    698 
    699 					verify(nvlist_lookup_nvlist_array(cnv,
    700 					    ZPOOL_CONFIG_CHILDREN, &rchild,
    701 					    &rchildren) == 0);
    702 					assert((is_replacing && rchildren == 2)
    703 					    || (is_spare && rchildren >= 2));
    704 					cnv = rchild[0];
    705 
    706 					verify(nvlist_lookup_string(cnv,
    707 					    ZPOOL_CONFIG_TYPE,
    708 					    &childtype) == 0);
    709 				}
    710 
    711 				verify(nvlist_lookup_string(cnv,
    712 				    ZPOOL_CONFIG_PATH, &path) == 0);
    713 
    714 				/*
    715 				 * If we have a raidz/mirror that combines disks
    716 				 * with files, report it as an error.
    717 				 */
    718 				if (!dontreport && type != NULL &&
    719 				    strcmp(type, childtype) != 0) {
    720 					if (ret != NULL)
    721 						free(ret);
    722 					ret = NULL;
    723 					if (fatal)
    724 						vdev_error(gettext(
    725 						    "mismatched replication "
    726 						    "level: %s contains both "
    727 						    "files and devices\n"),
    728 						    rep.zprl_type);
    729 					else
    730 						return (NULL);
    731 					dontreport = B_TRUE;
    732 				}
    733 
    734 				/*
    735 				 * According to stat(2), the value of 'st_size'
    736 				 * is undefined for block devices and character
    737 				 * devices.  But there is no effective way to
    738 				 * determine the real size in userland.
    739 				 *
    740 				 * Instead, we'll take advantage of an
    741 				 * implementation detail of spec_size().  If the
    742 				 * device is currently open, then we (should)
    743 				 * return a valid size.
    744 				 *
    745 				 * If we still don't get a valid size (indicated
    746 				 * by a size of 0 or MAXOFFSET_T), then ignore
    747 				 * this device altogether.
    748 				 */
    749 				if ((fd = open(path, O_RDONLY)) >= 0) {
    750 					err = fstat64(fd, &statbuf);
    751 					(void) close(fd);
    752 				} else {
    753 					err = stat64(path, &statbuf);
    754 				}
    755 
    756 				if (err != 0 ||
    757 				    statbuf.st_size == 0 ||
    758 				    statbuf.st_size == MAXOFFSET_T)
    759 					continue;
    760 
    761 				size = statbuf.st_size;
    762 
    763 				/*
    764 				 * Also make sure that devices and
    765 				 * slices have a consistent size.  If
    766 				 * they differ by a significant amount
    767 				 * (~16MB) then report an error.
    768 				 */
    769 				if (!dontreport &&
    770 				    (vdev_size != -1ULL &&
    771 				    (labs(size - vdev_size) >
    772 				    ZPOOL_FUZZ))) {
    773 					if (ret != NULL)
    774 						free(ret);
    775 					ret = NULL;
    776 					if (fatal)
    777 						vdev_error(gettext(
    778 						    "%s contains devices of "
    779 						    "different sizes\n"),
    780 						    rep.zprl_type);
    781 					else
    782 						return (NULL);
    783 					dontreport = B_TRUE;
    784 				}
    785 
    786 				type = childtype;
    787 				vdev_size = size;
    788 			}
    789 		}
    790 
    791 		/*
    792 		 * At this point, we have the replication of the last toplevel
    793 		 * vdev in 'rep'.  Compare it to 'lastrep' to see if its
    794 		 * different.
    795 		 */
    796 		if (lastrep.zprl_type != NULL) {
    797 			if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) {
    798 				if (ret != NULL)
    799 					free(ret);
    800 				ret = NULL;
    801 				if (fatal)
    802 					vdev_error(gettext(
    803 					    "mismatched replication level: "
    804 					    "both %s and %s vdevs are "
    805 					    "present\n"),
    806 					    lastrep.zprl_type, rep.zprl_type);
    807 				else
    808 					return (NULL);
    809 			} else if (lastrep.zprl_parity != rep.zprl_parity) {
    810 				if (ret)
    811 					free(ret);
    812 				ret = NULL;
    813 				if (fatal)
    814 					vdev_error(gettext(
    815 					    "mismatched replication level: "
    816 					    "both %llu and %llu device parity "
    817 					    "%s vdevs are present\n"),
    818 					    lastrep.zprl_parity,
    819 					    rep.zprl_parity,
    820 					    rep.zprl_type);
    821 				else
    822 					return (NULL);
    823 			} else if (lastrep.zprl_children != rep.zprl_children) {
    824 				if (ret)
    825 					free(ret);
    826 				ret = NULL;
    827 				if (fatal)
    828 					vdev_error(gettext(
    829 					    "mismatched replication level: "
    830 					    "both %llu-way and %llu-way %s "
    831 					    "vdevs are present\n"),
    832 					    lastrep.zprl_children,
    833 					    rep.zprl_children,
    834 					    rep.zprl_type);
    835 				else
    836 					return (NULL);
    837 			}
    838 		}
    839 		lastrep = rep;
    840 	}
    841 
    842 	if (ret != NULL)
    843 		*ret = rep;
    844 
    845 	return (ret);
    846 }
    847 
    848 /*
    849  * Check the replication level of the vdev spec against the current pool.  Calls
    850  * get_replication() to make sure the new spec is self-consistent.  If the pool
    851  * has a consistent replication level, then we ignore any errors.  Otherwise,
    852  * report any difference between the two.
    853  */
    854 static int
    855 check_replication(nvlist_t *config, nvlist_t *newroot)
    856 {
    857 	nvlist_t **child;
    858 	uint_t	children;
    859 	replication_level_t *current = NULL, *new;
    860 	int ret;
    861 
    862 	/*
    863 	 * If we have a current pool configuration, check to see if it's
    864 	 * self-consistent.  If not, simply return success.
    865 	 */
    866 	if (config != NULL) {
    867 		nvlist_t *nvroot;
    868 
    869 		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
    870 		    &nvroot) == 0);
    871 		if ((current = get_replication(nvroot, B_FALSE)) == NULL)
    872 			return (0);
    873 	}
    874 	/*
    875 	 * for spares there may be no children, and therefore no
    876 	 * replication level to check
    877 	 */
    878 	if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
    879 	    &child, &children) != 0) || (children == 0)) {
    880 		free(current);
    881 		return (0);
    882 	}
    883 
    884 	/*
    885 	 * If all we have is logs then there's no replication level to check.
    886 	 */
    887 	if (num_logs(newroot) == children) {
    888 		free(current);
    889 		return (0);
    890 	}
    891 
    892 	/*
    893 	 * Get the replication level of the new vdev spec, reporting any
    894 	 * inconsistencies found.
    895 	 */
    896 	if ((new = get_replication(newroot, B_TRUE)) == NULL) {
    897 		free(current);
    898 		return (-1);
    899 	}
    900 
    901 	/*
    902 	 * Check to see if the new vdev spec matches the replication level of
    903 	 * the current pool.
    904 	 */
    905 	ret = 0;
    906 	if (current != NULL) {
    907 		if (strcmp(current->zprl_type, new->zprl_type) != 0) {
    908 			vdev_error(gettext(
    909 			    "mismatched replication level: pool uses %s "
    910 			    "and new vdev is %s\n"),
    911 			    current->zprl_type, new->zprl_type);
    912 			ret = -1;
    913 		} else if (current->zprl_parity != new->zprl_parity) {
    914 			vdev_error(gettext(
    915 			    "mismatched replication level: pool uses %llu "
    916 			    "device parity and new vdev uses %llu\n"),
    917 			    current->zprl_parity, new->zprl_parity);
    918 			ret = -1;
    919 		} else if (current->zprl_children != new->zprl_children) {
    920 			vdev_error(gettext(
    921 			    "mismatched replication level: pool uses %llu-way "
    922 			    "%s and new vdev uses %llu-way %s\n"),
    923 			    current->zprl_children, current->zprl_type,
    924 			    new->zprl_children, new->zprl_type);
    925 			ret = -1;
    926 		}
    927 	}
    928 
    929 	free(new);
    930 	if (current != NULL)
    931 		free(current);
    932 
    933 	return (ret);
    934 }
    935 
    936 #ifdef illumos
    937 /*
    938  * Go through and find any whole disks in the vdev specification, labelling them
    939  * as appropriate.  When constructing the vdev spec, we were unable to open this
    940  * device in order to provide a devid.  Now that we have labelled the disk and
    941  * know that slice 0 is valid, we can construct the devid now.
    942  *
    943  * If the disk was already labeled with an EFI label, we will have gotten the
    944  * devid already (because we were able to open the whole disk).  Otherwise, we
    945  * need to get the devid after we label the disk.
    946  */
static int
make_disks(zpool_handle_t *zhp, nvlist_t *nv)
{
	nvlist_t **child;
	uint_t c, children;
	char *type, *path, *diskname;
	char buf[MAXPATHLEN];
	uint64_t wholedisk;
	int fd;
	int ret;
	ddi_devid_t devid;
	char *minor = NULL, *devid_str = NULL;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

	/*
	 * No ZPOOL_CONFIG_CHILDREN array means this nvlist describes a
	 * leaf vdev; handle it directly instead of recursing.
	 */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {

		/* Only disk leaves can be labeled; file vdevs need no work. */
		if (strcmp(type, VDEV_TYPE_DISK) != 0)
			return (0);

		/*
		 * We have a disk device.  Get the path to the device
		 * and see if it's a whole disk by appending the backup
		 * slice and stat()ing the device.
		 */
		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
		    &wholedisk) != 0 || !wholedisk)
			return (0);

		/* Label the disk using its basename (the part after '/'). */
		diskname = strrchr(path, '/');
		assert(diskname != NULL);
		diskname++;
		if (zpool_label_disk(g_zfs, zhp, diskname) == -1)
			return (-1);

		/*
		 * Fill in the devid, now that we've labeled the disk.
		 */
		(void) snprintf(buf, sizeof (buf), "%ss0", path);
		if ((fd = open(buf, O_RDONLY)) < 0) {
			(void) fprintf(stderr,
			    gettext("cannot open '%s': %s\n"),
			    buf, strerror(errno));
			return (-1);
		}

		/*
		 * A failure to obtain a devid is not fatal; the entry is
		 * simply left without a ZPOOL_CONFIG_DEVID value.
		 */
		if (devid_get(fd, &devid) == 0) {
			if (devid_get_minor_name(fd, &minor) == 0 &&
			    (devid_str = devid_str_encode(devid, minor)) !=
			    NULL) {
				verify(nvlist_add_string(nv,
				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
			}
			if (devid_str != NULL)
				devid_str_free(devid_str);
			if (minor != NULL)
				devid_str_free(minor);
			devid_free(devid);
		}

		/*
		 * Update the path to refer to the 's0' slice.  The presence of
		 * the 'whole_disk' field indicates to the CLI that we should
		 * chop off the slice number when displaying the device in
		 * future output.
		 */
		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0);

		(void) close(fd);

		return (0);
	}

	/* Interior vdev: recurse into each child, stopping on first error. */
	for (c = 0; c < children; c++)
		if ((ret = make_disks(zhp, child[c])) != 0)
			return (ret);

	/* Spares and L2ARC devices hang off separate arrays at the root. */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if ((ret = make_disks(zhp, child[c])) != 0)
				return (ret);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if ((ret = make_disks(zhp, child[c])) != 0)
				return (ret);

	return (0);
}
   1040 #endif	/* illumos */
   1041 
   1042 /*
   1043  * Determine if the given path is a hot spare within the given configuration.
   1044  */
   1045 static boolean_t
   1046 is_spare(nvlist_t *config, const char *path)
   1047 {
   1048 	int fd;
   1049 	pool_state_t state;
   1050 	char *name = NULL;
   1051 	nvlist_t *label;
   1052 	uint64_t guid, spareguid;
   1053 	nvlist_t *nvroot;
   1054 	nvlist_t **spares;
   1055 	uint_t i, nspares;
   1056 	boolean_t inuse;
   1057 
   1058 	if ((fd = open(path, O_RDONLY)) < 0)
   1059 		return (B_FALSE);
   1060 
   1061 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
   1062 	    !inuse ||
   1063 	    state != POOL_STATE_SPARE ||
   1064 	    zpool_read_label(fd, &label) != 0) {
   1065 		free(name);
   1066 		(void) close(fd);
   1067 		return (B_FALSE);
   1068 	}
   1069 	free(name);
   1070 	(void) close(fd);
   1071 
   1072 	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
   1073 	nvlist_free(label);
   1074 
   1075 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
   1076 	    &nvroot) == 0);
   1077 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
   1078 	    &spares, &nspares) == 0) {
   1079 		for (i = 0; i < nspares; i++) {
   1080 			verify(nvlist_lookup_uint64(spares[i],
   1081 			    ZPOOL_CONFIG_GUID, &spareguid) == 0);
   1082 			if (spareguid == guid)
   1083 				return (B_TRUE);
   1084 		}
   1085 	}
   1086 
   1087 	return (B_FALSE);
   1088 }
   1089 
   1090 /*
   1091  * Go through and find any devices that are in use.  We rely on libdiskmgt for
   1092  * the majority of this task.
   1093  */
static boolean_t
is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
    boolean_t replacing, boolean_t isspare)
{
	nvlist_t **child;
	uint_t c, children;
	char *type, *path;
	int ret = 0;
	char buf[MAXPATHLEN];
	uint64_t wholedisk;
	boolean_t anyinuse = B_FALSE;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

	/* Leaf vdev: no ZPOOL_CONFIG_CHILDREN array is present. */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {

		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);

		/*
		 * As a generic check, we look to see if this is a replace of a
		 * hot spare within the same pool.  If so, we allow it
		 * regardless of what libdiskmgt or zpool_in_use() says.
		 */
		if (replacing) {
#ifdef illumos
			/* Whole disks are addressed through their 's0' slice. */
			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
			    &wholedisk) == 0 && wholedisk)
				(void) snprintf(buf, sizeof (buf), "%ss0",
				    path);
			else
#endif
				(void) strlcpy(buf, path, sizeof (buf));

			if (is_spare(config, buf))
				return (B_FALSE);
		}

		/* Defer the actual in-use determination to the helpers. */
		if (strcmp(type, VDEV_TYPE_DISK) == 0)
			ret = check_device(path, force, isspare);
		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
			ret = check_file(path, force, isspare);

		return (ret != 0);
	}

	/*
	 * Interior vdev: visit every child (including spare and L2ARC
	 * arrays) so that all conflicts are reported, not just the first.
	 */
	for (c = 0; c < children; c++)
		if (is_device_in_use(config, child[c], force, replacing,
		    B_FALSE))
			anyinuse = B_TRUE;

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if (is_device_in_use(config, child[c], force, replacing,
			    B_TRUE))
				anyinuse = B_TRUE;

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if (is_device_in_use(config, child[c], force, replacing,
			    B_FALSE))
				anyinuse = B_TRUE;

	return (anyinuse);
}
   1161 
   1162 static const char *
   1163 is_grouping(const char *type, int *mindev, int *maxdev)
   1164 {
   1165 	if (strncmp(type, "raidz", 5) == 0) {
   1166 		const char *p = type + 5;
   1167 		char *end;
   1168 		long nparity;
   1169 
   1170 		if (*p == '\0') {
   1171 			nparity = 1;
   1172 		} else if (*p == '0') {
   1173 			return (NULL); /* no zero prefixes allowed */
   1174 		} else {
   1175 			errno = 0;
   1176 			nparity = strtol(p, &end, 10);
   1177 			if (errno != 0 || nparity < 1 || nparity >= 255 ||
   1178 			    *end != '\0')
   1179 				return (NULL);
   1180 		}
   1181 
   1182 		if (mindev != NULL)
   1183 			*mindev = nparity + 1;
   1184 		if (maxdev != NULL)
   1185 			*maxdev = 255;
   1186 		return (VDEV_TYPE_RAIDZ);
   1187 	}
   1188 
   1189 	if (maxdev != NULL)
   1190 		*maxdev = INT_MAX;
   1191 
   1192 	if (strcmp(type, "mirror") == 0) {
   1193 		if (mindev != NULL)
   1194 			*mindev = 2;
   1195 		return (VDEV_TYPE_MIRROR);
   1196 	}
   1197 
   1198 	if (strcmp(type, "spare") == 0) {
   1199 		if (mindev != NULL)
   1200 			*mindev = 1;
   1201 		return (VDEV_TYPE_SPARE);
   1202 	}
   1203 
   1204 	if (strcmp(type, "log") == 0) {
   1205 		if (mindev != NULL)
   1206 			*mindev = 1;
   1207 		return (VDEV_TYPE_LOG);
   1208 	}
   1209 
   1210 	if (strcmp(type, "cache") == 0) {
   1211 		if (mindev != NULL)
   1212 			*mindev = 1;
   1213 		return (VDEV_TYPE_L2CACHE);
   1214 	}
   1215 
   1216 	return (NULL);
   1217 }
   1218 
   1219 /*
   1220  * Construct a syntactically valid vdev specification,
   1221  * and ensure that all devices and files exist and can be opened.
   1222  * Note: we don't bother freeing anything in the error paths
   1223  * because the program is just going to exit anyway.
   1224  */
nvlist_t *
construct_spec(int argc, char **argv)
{
	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
	const char *type;
	uint64_t is_log;
	boolean_t seen_logs;

	top = NULL;
	toplevels = 0;
	spares = NULL;
	l2cache = NULL;
	nspares = 0;
	nlogs = 0;
	nl2cache = 0;
	is_log = B_FALSE;
	seen_logs = B_FALSE;

	while (argc > 0) {
		nv = NULL;

		/*
		 * If it's a mirror or raidz, the subsequent arguments are
		 * its leaves -- until we encounter the next mirror or raidz.
		 */
		if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
			nvlist_t **child = NULL;
			int c, children = 0;

			/* 'spare' may appear at most once in the spec. */
			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
				if (spares != NULL) {
					(void) fprintf(stderr,
					    gettext("invalid vdev "
					    "specification: 'spare' can be "
					    "specified only once\n"));
					return (NULL);
				}
				is_log = B_FALSE;
			}

			if (strcmp(type, VDEV_TYPE_LOG) == 0) {
				if (seen_logs) {
					(void) fprintf(stderr,
					    gettext("invalid vdev "
					    "specification: 'log' can be "
					    "specified only once\n"));
					return (NULL);
				}
				seen_logs = B_TRUE;
				is_log = B_TRUE;
				argc--;
				argv++;
				/*
				 * A log is not a real grouping device.
				 * We just set is_log and continue.
				 */
				continue;
			}

			/* 'cache' may likewise appear at most once. */
			if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
				if (l2cache != NULL) {
					(void) fprintf(stderr,
					    gettext("invalid vdev "
					    "specification: 'cache' can be "
					    "specified only once\n"));
					return (NULL);
				}
				is_log = B_FALSE;
			}

			/* The only grouping allowed for a log is a mirror. */
			if (is_log) {
				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
					(void) fprintf(stderr,
					    gettext("invalid vdev "
					    "specification: unsupported 'log' "
					    "device: %s\n"), type);
					return (NULL);
				}
				nlogs++;
			}

			/*
			 * Consume leaf arguments up to the next grouping
			 * keyword, building the child array as we go.
			 */
			for (c = 1; c < argc; c++) {
				if (is_grouping(argv[c], NULL, NULL) != NULL)
					break;
				children++;
				child = realloc(child,
				    children * sizeof (nvlist_t *));
				if (child == NULL)
					zpool_no_memory();
				if ((nv = make_leaf_vdev(argv[c], B_FALSE))
				    == NULL)
					return (NULL);
				child[children - 1] = nv;
			}

			/* Enforce the device-count limits set by is_grouping(). */
			if (children < mindev) {
				(void) fprintf(stderr, gettext("invalid vdev "
				    "specification: %s requires at least %d "
				    "devices\n"), argv[0], mindev);
				return (NULL);
			}

			if (children > maxdev) {
				(void) fprintf(stderr, gettext("invalid vdev "
				    "specification: %s supports no more than "
				    "%d devices\n"), argv[0], maxdev);
				return (NULL);
			}

			argc -= c;
			argv += c;

			/*
			 * Spare and cache children are collected separately;
			 * everything else becomes a top-level group nvlist.
			 */
			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
				spares = child;
				nspares = children;
				continue;
			} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
				l2cache = child;
				nl2cache = children;
				continue;
			} else {
				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
				    0) == 0);
				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
				    type) == 0);
				verify(nvlist_add_uint64(nv,
				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
					verify(nvlist_add_uint64(nv,
					    ZPOOL_CONFIG_NPARITY,
					    mindev - 1) == 0);
				}
				/* nvlist_add_nvlist_array copies the children. */
				verify(nvlist_add_nvlist_array(nv,
				    ZPOOL_CONFIG_CHILDREN, child,
				    children) == 0);

				for (c = 0; c < children; c++)
					nvlist_free(child[c]);
				free(child);
			}
		} else {
			/*
			 * We have a device.  Pass off to make_leaf_vdev() to
			 * construct the appropriate nvlist describing the vdev.
			 */
			if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL)
				return (NULL);
			if (is_log)
				nlogs++;
			argc--;
			argv++;
		}

		toplevels++;
		top = realloc(top, toplevels * sizeof (nvlist_t *));
		if (top == NULL)
			zpool_no_memory();
		top[toplevels - 1] = nv;
	}

	if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
		(void) fprintf(stderr, gettext("invalid vdev "
		    "specification: at least one toplevel vdev must be "
		    "specified\n"));
		return (NULL);
	}

	/* A bare 'log' keyword with no subsequent devices is an error. */
	if (seen_logs && nlogs == 0) {
		(void) fprintf(stderr, gettext("invalid vdev specification: "
		    "log requires at least 1 device\n"));
		return (NULL);
	}

	/*
	 * Finally, create nvroot and add all top-level vdevs to it.
	 */
	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    top, toplevels) == 0);
	if (nspares != 0)
		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    spares, nspares) == 0);
	if (nl2cache != 0)
		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
		    l2cache, nl2cache) == 0);

	/* The arrays were copied into nvroot; release our working copies. */
	for (t = 0; t < toplevels; t++)
		nvlist_free(top[t]);
	for (t = 0; t < nspares; t++)
		nvlist_free(spares[t]);
	for (t = 0; t < nl2cache; t++)
		nvlist_free(l2cache[t]);
	if (spares)
		free(spares);
	if (l2cache)
		free(l2cache);
	free(top);

	return (nvroot);
}
   1428 
   1429 nvlist_t *
   1430 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
   1431     splitflags_t flags, int argc, char **argv)
   1432 {
   1433 	nvlist_t *newroot = NULL, **child;
   1434 	uint_t c, children;
   1435 
   1436 	if (argc > 0) {
   1437 		if ((newroot = construct_spec(argc, argv)) == NULL) {
   1438 			(void) fprintf(stderr, gettext("Unable to build a "
   1439 			    "pool from the specified devices\n"));
   1440 			return (NULL);
   1441 		}
   1442 
   1443 #ifdef illumos
   1444 		if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
   1445 			nvlist_free(newroot);
   1446 			return (NULL);
   1447 		}
   1448 #endif
   1449 
   1450 		/* avoid any tricks in the spec */
   1451 		verify(nvlist_lookup_nvlist_array(newroot,
   1452 		    ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
   1453 		for (c = 0; c < children; c++) {
   1454 			char *path;
   1455 			const char *type;
   1456 			int min, max;
   1457 
   1458 			verify(nvlist_lookup_string(child[c],
   1459 			    ZPOOL_CONFIG_PATH, &path) == 0);
   1460 			if ((type = is_grouping(path, &min, &max)) != NULL) {
   1461 				(void) fprintf(stderr, gettext("Cannot use "
   1462 				    "'%s' as a device for splitting\n"), type);
   1463 				nvlist_free(newroot);
   1464 				return (NULL);
   1465 			}
   1466 		}
   1467 	}
   1468 
   1469 	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
   1470 		nvlist_free(newroot);
   1471 		return (NULL);
   1472 	}
   1473 
   1474 	return (newroot);
   1475 }
   1476 
   1477 /*
   1478  * Get and validate the contents of the given vdev specification.  This ensures
   1479  * that the nvlist returned is well-formed, that all the devices exist, and that
   1480  * they are not currently in use by any other known consumer.  The 'poolconfig'
   1481  * parameter is the current configuration of the pool when adding devices
   1482  * existing pool, and is used to perform additional checks, such as changing the
   1483  * replication level of the pool.  It can be 'NULL' to indicate that this is a
   1484  * new pool.  The 'force' flag controls whether devices should be forcefully
   1485  * added, even if they appear in use.
   1486  */
   1487 nvlist_t *
   1488 make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
   1489     boolean_t replacing, boolean_t dryrun, int argc, char **argv)
   1490 {
   1491 	nvlist_t *newroot;
   1492 	nvlist_t *poolconfig = NULL;
   1493 	is_force = force;
   1494 
   1495 	/*
   1496 	 * Construct the vdev specification.  If this is successful, we know
   1497 	 * that we have a valid specification, and that all devices can be
   1498 	 * opened.
   1499 	 */
   1500 	if ((newroot = construct_spec(argc, argv)) == NULL)
   1501 		return (NULL);
   1502 
   1503 	if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL))
   1504 		return (NULL);
   1505 
   1506 	/*
   1507 	 * Validate each device to make sure that its not shared with another
   1508 	 * subsystem.  We do this even if 'force' is set, because there are some
   1509 	 * uses (such as a dedicated dump device) that even '-f' cannot
   1510 	 * override.
   1511 	 */
   1512 	if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
   1513 		nvlist_free(newroot);
   1514 		return (NULL);
   1515 	}
   1516 
   1517 	/*
   1518 	 * Check the replication level of the given vdevs and report any errors
   1519 	 * found.  We include the existing pool spec, if any, as we need to
   1520 	 * catch changes against the existing replication level.
   1521 	 */
   1522 	if (check_rep && check_replication(poolconfig, newroot) != 0) {
   1523 		nvlist_free(newroot);
   1524 		return (NULL);
   1525 	}
   1526 
   1527 #ifdef illumos
   1528 	/*
   1529 	 * Run through the vdev specification and label any whole disks found.
   1530 	 */
   1531 	if (!dryrun && make_disks(zhp, newroot) != 0) {
   1532 		nvlist_free(newroot);
   1533 		return (NULL);
   1534 	}
   1535 #endif
   1536 
   1537 	return (newroot);
   1538 }
   1539