Home | History | Annotate | Line # | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Copyright (c) 2012 by Delphix. All rights reserved.
     28  */
     29 
     30 #include <sys/spa.h>
     31 #include <sys/spa_impl.h>
     32 #include <sys/vdev.h>
     33 #include <sys/vdev_impl.h>
     34 #include <sys/zio.h>
     35 #include <sys/zio_checksum.h>
     36 
     37 #include <sys/fm/fs/zfs.h>
     38 #include <sys/fm/protocol.h>
     39 #include <sys/fm/util.h>
     40 #include <sys/sysevent.h>
     41 
     42 /*
     43  * This general routine is responsible for generating all the different ZFS
     44  * ereports.  The payload is dependent on the class, and which arguments are
     45  * supplied to the function:
     46  *
     47  * 	EREPORT			POOL	VDEV	IO
     48  * 	block			X	X	X
     49  * 	data			X		X
     50  * 	device			X	X
     51  * 	pool			X
     52  *
     53  * If we are in a loading state, all errors are chained together by the same
     54  * SPA-wide ENA (Error Numeric Association).
     55  *
     56  * For isolated I/O requests, we get the ENA from the zio_t. The propagation
     57  * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
     58  * to chain together all ereports associated with a logical piece of data.  For
 * read I/Os, there are basically three 'types' of I/O, which form a roughly
     60  * layered diagram:
     61  *
     62  *      +---------------+
     63  * 	| Aggregate I/O |	No associated logical data or device
     64  * 	+---------------+
     65  *              |
     66  *              V
     67  * 	+---------------+	Reads associated with a piece of logical data.
     68  * 	|   Read I/O    |	This includes reads on behalf of RAID-Z,
     69  * 	+---------------+       mirrors, gang blocks, retries, etc.
     70  *              |
     71  *              V
     72  * 	+---------------+	Reads associated with a particular device, but
     73  * 	| Physical I/O  |	no logical data.  Issued as part of vdev caching
     74  * 	+---------------+	and I/O aggregation.
     75  *
     76  * Note that 'physical I/O' here is not the same terminology as used in the rest
     77  * of ZIO.  Typically, 'physical I/O' simply means that there is no attached
     78  * blockpointer.  But I/O with no associated block pointer can still be related
     79  * to a logical piece of data (i.e. RAID-Z requests).
     80  *
     81  * Purely physical I/O always have unique ENAs.  They are not related to a
     82  * particular piece of logical data, and therefore cannot be chained together.
     83  * We still generate an ereport, but the DE doesn't correlate it with any
     84  * logical piece of data.  When such an I/O fails, the delegated I/O requests
     85  * will issue a retry, which will trigger the 'real' ereport with the correct
     86  * ENA.
     87  *
     88  * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
     89  * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
 * then inherit this pointer, so that once an ENA is set, subsequent failures
     91  * will use the same ENA.  For vdev cache fill and queue aggregation I/O,
     92  * this pointer is set to NULL, and no ereport will be generated (since it
     93  * doesn't actually correspond to any particular device or piece of data,
     94  * and the caller will always retry without caching or queueing anyway).
     95  *
     96  * For checksum errors, we want to include more information about the actual
     97  * error which occurs.  Accordingly, we build an ereport when the error is
     98  * noticed, but instead of sending it in immediately, we hang it off of the
     99  * io_cksum_report field of the logical IO.  When the logical IO completes
    100  * (successfully or not), zfs_ereport_finish_checksum() is called with the
    101  * good and bad versions of the buffer (if available), and we annotate the
    102  * ereport with information about the differences.
    103  */
    104 #ifdef _KERNEL
    105 static void
    106 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
    107     const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
    108     uint64_t stateoroffset, uint64_t size)
    109 {
    110 	nvlist_t *ereport, *detector;
    111 
    112 	uint64_t ena;
    113 	char class[64];
    114 
    115 	/*
    116 	 * If we are doing a spa_tryimport() or in recovery mode,
    117 	 * ignore errors.
    118 	 */
    119 	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
    120 	    spa_load_state(spa) == SPA_LOAD_RECOVER)
    121 		return;
    122 
    123 	/*
    124 	 * If we are in the middle of opening a pool, and the previous attempt
    125 	 * failed, don't bother logging any new ereports - we're just going to
    126 	 * get the same diagnosis anyway.
    127 	 */
    128 	if (spa_load_state(spa) != SPA_LOAD_NONE &&
    129 	    spa->spa_last_open_failed)
    130 		return;
    131 
    132 	if (zio != NULL) {
    133 		/*
    134 		 * If this is not a read or write zio, ignore the error.  This
    135 		 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
    136 		 */
    137 		if (zio->io_type != ZIO_TYPE_READ &&
    138 		    zio->io_type != ZIO_TYPE_WRITE)
    139 			return;
    140 
    141 		/*
    142 		 * Ignore any errors from speculative I/Os, as failure is an
    143 		 * expected result.
    144 		 */
    145 		if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
    146 			return;
    147 
    148 		/*
    149 		 * If this I/O is not a retry I/O, don't post an ereport.
    150 		 * Otherwise, we risk making bad diagnoses based on B_FAILFAST
    151 		 * I/Os.
    152 		 */
    153 		if (zio->io_error == EIO &&
    154 		    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
    155 			return;
    156 
    157 		if (vd != NULL) {
    158 			/*
    159 			 * If the vdev has already been marked as failing due
    160 			 * to a failed probe, then ignore any subsequent I/O
    161 			 * errors, as the DE will automatically fault the vdev
    162 			 * on the first such failure.  This also catches cases
    163 			 * where vdev_remove_wanted is set and the device has
    164 			 * not yet been asynchronously placed into the REMOVED
    165 			 * state.
    166 			 */
    167 			if (zio->io_vd == vd && !vdev_accessible(vd, zio))
    168 				return;
    169 
    170 			/*
    171 			 * Ignore checksum errors for reads from DTL regions of
    172 			 * leaf vdevs.
    173 			 */
    174 			if (zio->io_type == ZIO_TYPE_READ &&
    175 			    zio->io_error == ECKSUM &&
    176 			    vd->vdev_ops->vdev_op_leaf &&
    177 			    vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
    178 				return;
    179 		}
    180 	}
    181 
    182 	/*
    183 	 * For probe failure, we want to avoid posting ereports if we've
    184 	 * already removed the device in the meantime.
    185 	 */
    186 	if (vd != NULL &&
    187 	    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
    188 	    (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
    189 		return;
    190 
    191 	if ((ereport = fm_nvlist_create(NULL)) == NULL)
    192 		return;
    193 
    194 	if ((detector = fm_nvlist_create(NULL)) == NULL) {
    195 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
    196 		return;
    197 	}
    198 
    199 	/*
    200 	 * Serialize ereport generation
    201 	 */
    202 	mutex_enter(&spa->spa_errlist_lock);
    203 
    204 	/*
    205 	 * Determine the ENA to use for this event.  If we are in a loading
    206 	 * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
    207 	 * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
    208 	 */
    209 	if (spa_load_state(spa) != SPA_LOAD_NONE) {
    210 		if (spa->spa_ena == 0)
    211 			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
    212 		ena = spa->spa_ena;
    213 	} else if (zio != NULL && zio->io_logical != NULL) {
    214 		if (zio->io_logical->io_ena == 0)
    215 			zio->io_logical->io_ena =
    216 			    fm_ena_generate(0, FM_ENA_FMT1);
    217 		ena = zio->io_logical->io_ena;
    218 	} else {
    219 		ena = fm_ena_generate(0, FM_ENA_FMT1);
    220 	}
    221 
    222 	/*
    223 	 * Construct the full class, detector, and other standard FMA fields.
    224 	 */
    225 	(void) snprintf(class, sizeof (class), "%s.%s",
    226 	    ZFS_ERROR_CLASS, subclass);
    227 
    228 	fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
    229 	    vd != NULL ? vd->vdev_guid : 0);
    230 
    231 	fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
    232 
    233 	/*
    234 	 * Construct the per-ereport payload, depending on which parameters are
    235 	 * passed in.
    236 	 */
    237 
    238 	/*
    239 	 * Generic payload members common to all ereports.
    240 	 */
    241 	fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
    242 	    DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
    243 	    DATA_TYPE_UINT64, spa_guid(spa),
    244 	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
    245 	    spa_load_state(spa), NULL);
    246 
    247 	if (spa != NULL) {
    248 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
    249 		    DATA_TYPE_STRING,
    250 		    spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
    251 		    FM_EREPORT_FAILMODE_WAIT :
    252 		    spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
    253 		    FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
    254 		    NULL);
    255 	}
    256 
    257 	if (vd != NULL) {
    258 		vdev_t *pvd = vd->vdev_parent;
    259 
    260 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
    261 		    DATA_TYPE_UINT64, vd->vdev_guid,
    262 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
    263 		    DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
    264 		if (vd->vdev_path != NULL)
    265 			fm_payload_set(ereport,
    266 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
    267 			    DATA_TYPE_STRING, vd->vdev_path, NULL);
    268 		if (vd->vdev_devid != NULL)
    269 			fm_payload_set(ereport,
    270 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
    271 			    DATA_TYPE_STRING, vd->vdev_devid, NULL);
    272 		if (vd->vdev_fru != NULL)
    273 			fm_payload_set(ereport,
    274 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
    275 			    DATA_TYPE_STRING, vd->vdev_fru, NULL);
    276 
    277 		if (pvd != NULL) {
    278 			fm_payload_set(ereport,
    279 			    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
    280 			    DATA_TYPE_UINT64, pvd->vdev_guid,
    281 			    FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
    282 			    DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
    283 			    NULL);
    284 			if (pvd->vdev_path)
    285 				fm_payload_set(ereport,
    286 				    FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
    287 				    DATA_TYPE_STRING, pvd->vdev_path, NULL);
    288 			if (pvd->vdev_devid)
    289 				fm_payload_set(ereport,
    290 				    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
    291 				    DATA_TYPE_STRING, pvd->vdev_devid, NULL);
    292 		}
    293 	}
    294 
    295 	if (zio != NULL) {
    296 		/*
    297 		 * Payload common to all I/Os.
    298 		 */
    299 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
    300 		    DATA_TYPE_INT32, zio->io_error, NULL);
    301 
    302 		/*
    303 		 * If the 'size' parameter is non-zero, it indicates this is a
    304 		 * RAID-Z or other I/O where the physical offset and length are
    305 		 * provided for us, instead of within the zio_t.
    306 		 */
    307 		if (vd != NULL) {
    308 			if (size)
    309 				fm_payload_set(ereport,
    310 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
    311 				    DATA_TYPE_UINT64, stateoroffset,
    312 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
    313 				    DATA_TYPE_UINT64, size, NULL);
    314 			else
    315 				fm_payload_set(ereport,
    316 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
    317 				    DATA_TYPE_UINT64, zio->io_offset,
    318 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
    319 				    DATA_TYPE_UINT64, zio->io_size, NULL);
    320 		}
    321 
    322 		/*
    323 		 * Payload for I/Os with corresponding logical information.
    324 		 */
    325 		if (zio->io_logical != NULL)
    326 			fm_payload_set(ereport,
    327 			    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
    328 			    DATA_TYPE_UINT64,
    329 			    zio->io_logical->io_bookmark.zb_objset,
    330 			    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
    331 			    DATA_TYPE_UINT64,
    332 			    zio->io_logical->io_bookmark.zb_object,
    333 			    FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
    334 			    DATA_TYPE_INT64,
    335 			    zio->io_logical->io_bookmark.zb_level,
    336 			    FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
    337 			    DATA_TYPE_UINT64,
    338 			    zio->io_logical->io_bookmark.zb_blkid, NULL);
    339 	} else if (vd != NULL) {
    340 		/*
    341 		 * If we have a vdev but no zio, this is a device fault, and the
    342 		 * 'stateoroffset' parameter indicates the previous state of the
    343 		 * vdev.
    344 		 */
    345 		fm_payload_set(ereport,
    346 		    FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
    347 		    DATA_TYPE_UINT64, stateoroffset, NULL);
    348 	}
    349 
    350 	mutex_exit(&spa->spa_errlist_lock);
    351 
    352 	*ereport_out = ereport;
    353 	*detector_out = detector;
    354 }
    355 
/*
 * Inline capture limit, expressed in uint64_t words: if it's <= 128 bytes,
 * save the corruption directly.
 */
#define	ZFM_MAX_INLINE		(128 / sizeof (uint64_t))

/* number of entries in each of the per-range arrays below */
#define	MAX_RANGES		16

/*
 * Scratch state built while comparing the good and bad copies of a buffer
 * for a checksum ereport.
 */
typedef struct zfs_ecksum_info {
	/* histograms of set and cleared bits by bit number in a 64-bit word */
	uint16_t zei_histogram_set[sizeof (uint64_t) * NBBY];
	uint16_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];

	/* inline arrays of bits set and cleared. */
	uint64_t zei_bits_set[ZFM_MAX_INLINE];
	uint64_t zei_bits_cleared[ZFM_MAX_INLINE];

	/*
	 * for each range, the number of bits set and cleared.  The Hamming
	 * distance between the good and bad buffers is the sum of them all.
	 */
	uint32_t zei_range_sets[MAX_RANGES];
	uint32_t zei_range_clears[MAX_RANGES];

	/*
	 * The differing ranges, as [zr_start, zr_end) pairs.  They are kept
	 * as uint64_t-word indices while being collected and are converted
	 * to byte offsets by annotate_ecksum() before being reported.
	 */
	struct zei_ranges {
		uint32_t	zr_start;
		uint32_t	zr_end;
	} zei_ranges[MAX_RANGES];

	/* number of valid entries in zei_ranges[] */
	size_t	zei_range_count;
	/* smallest gap recorded between adjacent entries of zei_ranges[] */
	uint32_t zei_mingap;
	/* ranges separated by less than this are merged into one */
	uint32_t zei_allowed_mingap;

} zfs_ecksum_info_t;
    387 
/*
 * Tally the bits that are set in value_arg.
 *
 * The word is first converted with BE_64(); then, for every set bit i of the
 * converted value, hist[63 - i] is incremented.  The number of set bits is
 * added to *count.  'hist' must therefore have at least 64 entries.
 */
static void
update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count)
{
	uint64_t remaining = BE_64(value_arg);
	uint32_t nset = 0;
	int slot;

	/*
	 * We store the bits in big-endian (largest-first) order: the lowest
	 * bit lands in the last slot.  Stop as soon as no set bits remain.
	 */
	for (slot = 63; remaining != 0; slot--, remaining >>= 1) {
		if (remaining & 1) {
			hist[slot]++;
			nset++;
		}
	}

	/* update the count of bits changed */
	*count += nset;
}
    405 
    406 /*
    407  * We've now filled up the range array, and need to increase "mingap" and
    408  * shrink the range list accordingly.  zei_mingap is always the smallest
    409  * distance between array entries, so we set the new_allowed_gap to be
    410  * one greater than that.  We then go through the list, joining together
    411  * any ranges which are closer than the new_allowed_gap.
    412  *
    413  * By construction, there will be at least one.  We also update zei_mingap
    414  * to the new smallest gap, to prepare for our next invocation.
    415  */
    416 static void
    417 shrink_ranges(zfs_ecksum_info_t *eip)
    418 {
    419 	uint32_t mingap = UINT32_MAX;
    420 	uint32_t new_allowed_gap = eip->zei_mingap + 1;
    421 
    422 	size_t idx, output;
    423 	size_t max = eip->zei_range_count;
    424 
    425 	struct zei_ranges *r = eip->zei_ranges;
    426 
    427 	ASSERT3U(eip->zei_range_count, >, 0);
    428 	ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
    429 
    430 	output = idx = 0;
    431 	while (idx < max - 1) {
    432 		uint32_t start = r[idx].zr_start;
    433 		uint32_t end = r[idx].zr_end;
    434 
    435 		while (idx < max - 1) {
    436 			idx++;
    437 
    438 			uint32_t nstart = r[idx].zr_start;
    439 			uint32_t nend = r[idx].zr_end;
    440 
    441 			uint32_t gap = nstart - end;
    442 			if (gap < new_allowed_gap) {
    443 				end = nend;
    444 				continue;
    445 			}
    446 			if (gap < mingap)
    447 				mingap = gap;
    448 			break;
    449 		}
    450 		r[output].zr_start = start;
    451 		r[output].zr_end = end;
    452 		output++;
    453 	}
    454 	ASSERT3U(output, <, eip->zei_range_count);
    455 	eip->zei_range_count = output;
    456 	eip->zei_mingap = mingap;
    457 	eip->zei_allowed_mingap = new_allowed_gap;
    458 }
    459 
    460 static void
    461 add_range(zfs_ecksum_info_t *eip, int start, int end)
    462 {
    463 	struct zei_ranges *r = eip->zei_ranges;
    464 	size_t count = eip->zei_range_count;
    465 
    466 	if (count >= MAX_RANGES) {
    467 		shrink_ranges(eip);
    468 		count = eip->zei_range_count;
    469 	}
    470 	if (count == 0) {
    471 		eip->zei_mingap = UINT32_MAX;
    472 		eip->zei_allowed_mingap = 1;
    473 	} else {
    474 		int gap = start - r[count - 1].zr_end;
    475 
    476 		if (gap < eip->zei_allowed_mingap) {
    477 			r[count - 1].zr_end = end;
    478 			return;
    479 		}
    480 		if (gap < eip->zei_mingap)
    481 			eip->zei_mingap = gap;
    482 	}
    483 	r[count].zr_start = start;
    484 	r[count].zr_end = end;
    485 	eip->zei_range_count++;
    486 }
    487 
    488 static size_t
    489 range_total_size(zfs_ecksum_info_t *eip)
    490 {
    491 	struct zei_ranges *r = eip->zei_ranges;
    492 	size_t count = eip->zei_range_count;
    493 	size_t result = 0;
    494 	size_t idx;
    495 
    496 	for (idx = 0; idx < count; idx++)
    497 		result += (r[idx].zr_end - r[idx].zr_start);
    498 
    499 	return (result);
    500 }
    501 
    502 static zfs_ecksum_info_t *
    503 annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
    504     const uint8_t *goodbuf, const uint8_t *badbuf, size_t size,
    505     boolean_t drop_if_identical)
    506 {
    507 	const uint64_t *good = (const uint64_t *)goodbuf;
    508 	const uint64_t *bad = (const uint64_t *)badbuf;
    509 
    510 	uint64_t allset = 0;
    511 	uint64_t allcleared = 0;
    512 
    513 	size_t nui64s = size / sizeof (uint64_t);
    514 
    515 	size_t inline_size;
    516 	int no_inline = 0;
    517 	size_t idx;
    518 	size_t range;
    519 
    520 	size_t offset = 0;
    521 	ssize_t start = -1;
    522 
    523 	zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
    524 
    525 	/* don't do any annotation for injected checksum errors */
    526 	if (info != NULL && info->zbc_injected)
    527 		return (eip);
    528 
    529 	if (info != NULL && info->zbc_has_cksum) {
    530 		fm_payload_set(ereport,
    531 		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
    532 		    DATA_TYPE_UINT64_ARRAY,
    533 		    sizeof (info->zbc_expected) / sizeof (uint64_t),
    534 		    (uint64_t *)&info->zbc_expected,
    535 		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
    536 		    DATA_TYPE_UINT64_ARRAY,
    537 		    sizeof (info->zbc_actual) / sizeof (uint64_t),
    538 		    (uint64_t *)&info->zbc_actual,
    539 		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
    540 		    DATA_TYPE_STRING,
    541 		    info->zbc_checksum_name,
    542 		    NULL);
    543 
    544 		if (info->zbc_byteswapped) {
    545 			fm_payload_set(ereport,
    546 			    FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
    547 			    DATA_TYPE_BOOLEAN, 1,
    548 			    NULL);
    549 		}
    550 	}
    551 
    552 	if (badbuf == NULL || goodbuf == NULL)
    553 		return (eip);
    554 
    555 	ASSERT3U(nui64s, <=, UINT16_MAX);
    556 	ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
    557 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
    558 	ASSERT3U(size, <=, UINT32_MAX);
    559 
    560 	/* build up the range list by comparing the two buffers. */
    561 	for (idx = 0; idx < nui64s; idx++) {
    562 		if (good[idx] == bad[idx]) {
    563 			if (start == -1)
    564 				continue;
    565 
    566 			add_range(eip, start, idx);
    567 			start = -1;
    568 		} else {
    569 			if (start != -1)
    570 				continue;
    571 
    572 			start = idx;
    573 		}
    574 	}
    575 	if (start != -1)
    576 		add_range(eip, start, idx);
    577 
    578 	/* See if it will fit in our inline buffers */
    579 	inline_size = range_total_size(eip);
    580 	if (inline_size > ZFM_MAX_INLINE)
    581 		no_inline = 1;
    582 
    583 	/*
    584 	 * If there is no change and we want to drop if the buffers are
    585 	 * identical, do so.
    586 	 */
    587 	if (inline_size == 0 && drop_if_identical) {
    588 		kmem_free(eip, sizeof (*eip));
    589 		return (NULL);
    590 	}
    591 
    592 	/*
    593 	 * Now walk through the ranges, filling in the details of the
    594 	 * differences.  Also convert our uint64_t-array offsets to byte
    595 	 * offsets.
    596 	 */
    597 	for (range = 0; range < eip->zei_range_count; range++) {
    598 		size_t start = eip->zei_ranges[range].zr_start;
    599 		size_t end = eip->zei_ranges[range].zr_end;
    600 
    601 		for (idx = start; idx < end; idx++) {
    602 			uint64_t set, cleared;
    603 
    604 			// bits set in bad, but not in good
    605 			set = ((~good[idx]) & bad[idx]);
    606 			// bits set in good, but not in bad
    607 			cleared = (good[idx] & (~bad[idx]));
    608 
    609 			allset |= set;
    610 			allcleared |= cleared;
    611 
    612 			if (!no_inline) {
    613 				ASSERT3U(offset, <, inline_size);
    614 				eip->zei_bits_set[offset] = set;
    615 				eip->zei_bits_cleared[offset] = cleared;
    616 				offset++;
    617 			}
    618 
    619 			update_histogram(set, eip->zei_histogram_set,
    620 			    &eip->zei_range_sets[range]);
    621 			update_histogram(cleared, eip->zei_histogram_cleared,
    622 			    &eip->zei_range_clears[range]);
    623 		}
    624 
    625 		/* convert to byte offsets */
    626 		eip->zei_ranges[range].zr_start	*= sizeof (uint64_t);
    627 		eip->zei_ranges[range].zr_end	*= sizeof (uint64_t);
    628 	}
    629 	eip->zei_allowed_mingap	*= sizeof (uint64_t);
    630 	inline_size		*= sizeof (uint64_t);
    631 
    632 	/* fill in ereport */
    633 	fm_payload_set(ereport,
    634 	    FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
    635 	    DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
    636 	    (uint32_t *)eip->zei_ranges,
    637 	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
    638 	    DATA_TYPE_UINT32, eip->zei_allowed_mingap,
    639 	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
    640 	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
    641 	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
    642 	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
    643 	    NULL);
    644 
    645 	if (!no_inline) {
    646 		fm_payload_set(ereport,
    647 		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
    648 		    DATA_TYPE_UINT8_ARRAY,
    649 		    inline_size, (uint8_t *)eip->zei_bits_set,
    650 		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
    651 		    DATA_TYPE_UINT8_ARRAY,
    652 		    inline_size, (uint8_t *)eip->zei_bits_cleared,
    653 		    NULL);
    654 	} else {
    655 		fm_payload_set(ereport,
    656 		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
    657 		    DATA_TYPE_UINT16_ARRAY,
    658 		    NBBY * sizeof (uint64_t), eip->zei_histogram_set,
    659 		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
    660 		    DATA_TYPE_UINT16_ARRAY,
    661 		    NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
    662 		    NULL);
    663 	}
    664 	return (eip);
    665 }
    666 #endif
    667 
    668 void
    669 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
    670     uint64_t stateoroffset, uint64_t size)
    671 {
    672 #ifdef _KERNEL
    673 	nvlist_t *ereport = NULL;
    674 	nvlist_t *detector = NULL;
    675 
    676 	zfs_ereport_start(&ereport, &detector,
    677 	    subclass, spa, vd, zio, stateoroffset, size);
    678 
    679 	if (ereport == NULL)
    680 		return;
    681 
    682 	fm_ereport_post(ereport, EVCH_SLEEP);
    683 
    684 	fm_nvlist_destroy(ereport, FM_NVA_FREE);
    685 	fm_nvlist_destroy(detector, FM_NVA_FREE);
    686 #endif
    687 }
    688 
    689 void
    690 zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
    691     struct zio *zio, uint64_t offset, uint64_t length, void *arg,
    692     zio_bad_cksum_t *info)
    693 {
    694 	zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);
    695 
    696 	if (zio->io_vsd != NULL)
    697 		zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
    698 	else
    699 		zio_vsd_default_cksum_report(zio, report, arg);
    700 
    701 	/* copy the checksum failure information if it was provided */
    702 	if (info != NULL) {
    703 		report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
    704 		bcopy(info, report->zcr_ckinfo, sizeof (*info));
    705 	}
    706 
    707 	report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
    708 	report->zcr_length = length;
    709 
    710 #ifdef _KERNEL
    711 	zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
    712 	    FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
    713 
    714 	if (report->zcr_ereport == NULL) {
    715 		report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo);
    716 		if (report->zcr_ckinfo != NULL) {
    717 			kmem_free(report->zcr_ckinfo,
    718 			    sizeof (*report->zcr_ckinfo));
    719 		}
    720 		kmem_free(report, sizeof (*report));
    721 		return;
    722 	}
    723 #endif
    724 
    725 	mutex_enter(&spa->spa_errlist_lock);
    726 	report->zcr_next = zio->io_logical->io_cksum_report;
    727 	zio->io_logical->io_cksum_report = report;
    728 	mutex_exit(&spa->spa_errlist_lock);
    729 }
    730 
    731 void
    732 zfs_ereport_finish_checksum(zio_cksum_report_t *report,
    733     const void *good_data, const void *bad_data, boolean_t drop_if_identical)
    734 {
    735 #ifdef _KERNEL
    736 	zfs_ecksum_info_t *info = NULL;
    737 	info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
    738 	    good_data, bad_data, report->zcr_length, drop_if_identical);
    739 
    740 	if (info != NULL)
    741 		fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
    742 
    743 	fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE);
    744 	fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE);
    745 	report->zcr_ereport = report->zcr_detector = NULL;
    746 
    747 	if (info != NULL)
    748 		kmem_free(info, sizeof (*info));
    749 #endif
    750 }
    751 
    752 void
    753 zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
    754 {
    755 #ifdef _KERNEL
    756 	if (rpt->zcr_ereport != NULL) {
    757 		fm_nvlist_destroy(rpt->zcr_ereport,
    758 		    FM_NVA_FREE);
    759 		fm_nvlist_destroy(rpt->zcr_detector,
    760 		    FM_NVA_FREE);
    761 	}
    762 #endif
    763 	rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
    764 
    765 	if (rpt->zcr_ckinfo != NULL)
    766 		kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
    767 
    768 	kmem_free(rpt, sizeof (*rpt));
    769 }
    770 
/*
 * Post the report's ereport in its current state.  The ereport is neither
 * annotated nor released here, so the report remains usable afterwards.
 * No-op outside the kernel.
 */
void
zfs_ereport_send_interim_checksum(zio_cksum_report_t *report)
{
#ifdef _KERNEL
	fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
#endif
}
    778 
    779 void
    780 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
    781     struct zio *zio, uint64_t offset, uint64_t length,
    782     const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc)
    783 {
    784 #ifdef _KERNEL
    785 	nvlist_t *ereport = NULL;
    786 	nvlist_t *detector = NULL;
    787 	zfs_ecksum_info_t *info;
    788 
    789 	zfs_ereport_start(&ereport, &detector,
    790 	    FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
    791 
    792 	if (ereport == NULL)
    793 		return;
    794 
    795 	info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
    796 	    B_FALSE);
    797 
    798 	if (info != NULL)
    799 		fm_ereport_post(ereport, EVCH_SLEEP);
    800 
    801 	fm_nvlist_destroy(ereport, FM_NVA_FREE);
    802 	fm_nvlist_destroy(detector, FM_NVA_FREE);
    803 
    804 	if (info != NULL)
    805 		kmem_free(info, sizeof (*info));
    806 #endif
    807 }
    808 
    809 static void
    810 zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
    811 {
    812 #ifdef _KERNEL
    813 	nvlist_t *resource;
    814 	char class[64];
    815 
    816 	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
    817 		return;
    818 
    819 	if ((resource = fm_nvlist_create(NULL)) == NULL)
    820 		return;
    821 
    822 	(void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
    823 	    ZFS_ERROR_CLASS, name);
    824 	VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
    825 	VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
    826 	VERIFY(nvlist_add_uint64(resource,
    827 	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
    828 	if (vd)
    829 		VERIFY(nvlist_add_uint64(resource,
    830 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);
    831 
    832 	fm_ereport_post(resource, EVCH_SLEEP);
    833 
    834 	fm_nvlist_destroy(resource, FM_NVA_FREE);
    835 #endif
    836 }
    837 
/*
 * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
 * has been removed from the system.  This will cause the DE to ignore any
 * recent I/O errors, inferring that they are due to the asynchronous device
 * removal.
 *
 * The event is posted through zfs_post_common() with FM_RESOURCE_REMOVED, so
 * it is suppressed while the pool is in the SPA_LOAD_TRYIMPORT load state.
 */
void
zfs_post_remove(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RESOURCE_REMOVED);
}
    849 
/*
 * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
 * has the 'autoreplace' property set, and therefore any broken vdevs will be
 * handled by higher level logic, and no vdev fault should be generated.
 *
 * The event is posted through zfs_post_common() with FM_RESOURCE_AUTOREPLACE,
 * so it is suppressed while the pool is in the SPA_LOAD_TRYIMPORT load state.
 */
void
zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
}
    860 
/*
 * The 'resource.fs.zfs.statechange' event is an internal signal that the
 * given vdev has transitioned its state to DEGRADED or HEALTHY.  This will
 * cause the retire agent to repair any outstanding fault management cases
 * open because the device was not found (fault.fs.zfs.device).
 *
 * The event is posted through zfs_post_common() with FM_RESOURCE_STATECHANGE,
 * so it is suppressed while the pool is in the SPA_LOAD_TRYIMPORT load state.
 */
void
zfs_post_state_change(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE);
}
    872