Home | History | Annotate | Line # | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
     23  * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
     24  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
     25  * Copyright 2013 Saso Kiselkov. All rights reserved.
     26  */
     27 
     28 #include <sys/zfs_context.h>
     29 #include <sys/spa.h>
     30 #include <sys/spa_impl.h>
     31 #include <sys/zio.h>
     32 #include <sys/zio_checksum.h>
     33 #include <sys/zil.h>
     34 #include <zfs_fletcher.h>
     35 
     36 /*
     37  * Checksum vectors.
     38  *
     39  * In the SPA, everything is checksummed.  We support checksum vectors
     40  * for three distinct reasons:
     41  *
     42  *   1. Different kinds of data need different levels of protection.
     43  *	For SPA metadata, we always want a very strong checksum.
     44  *	For user data, we let users make the trade-off between speed
     45  *	and checksum strength.
     46  *
     47  *   2. Cryptographic hash and MAC algorithms are an area of active research.
     48  *	It is likely that in future hash functions will be at least as strong
     49  *	as current best-of-breed, and may be substantially faster as well.
     50  *	We want the ability to take advantage of these new hashes as soon as
     51  *	they become available.
     52  *
     53  *   3. If someone develops hardware that can compute a strong hash quickly,
     54  *	we want the ability to take advantage of that hardware.
     55  *
     56  * Of course, we don't want a checksum upgrade to invalidate existing
     57  * data, so we store the checksum *function* in eight bits of the bp.
     58  * This gives us room for up to 256 different checksum functions.
     59  *
     60  * When writing a block, we always checksum it with the latest-and-greatest
     61  * checksum function of the appropriate strength.  When reading a block,
     62  * we compare the expected checksum against the actual checksum, which we
     63  * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
     64  *
     65  * SALTED CHECKSUMS
     66  *
     67  * To enable the use of less secure hash algorithms with dedup, we
     68  * introduce the notion of salted checksums (MACs, really).  A salted
     69  * checksum is fed both a random 256-bit value (the salt) and the data
     70  * to be checksummed.  This salt is kept secret (stored on the pool, but
     71  * never shown to the user).  Thus even if an attacker knew of collision
     72  * weaknesses in the hash algorithm, they won't be able to mount a known
     73  * plaintext attack on the DDT, since the actual hash value cannot be
     74  * known ahead of time.  How the salt is used is algorithm-specific
     75  * (some might simply prefix it to the data block, others might need to
     76  * utilize a full-blown HMAC).  On disk the salt is stored in a ZAP
     77  * object in the MOS (DMU_POOL_CHECKSUM_SALT).
     78  *
     79  * CONTEXT TEMPLATES
     80  *
     81  * Some hashing algorithms need to perform a substantial amount of
     82  * initialization work (e.g. salted checksums above may need to pre-hash
     83  * the salt) before being able to process data.  Performing this
     84  * redundant work for each block would be wasteful, so we instead allow
     85  * a checksum algorithm to do the work once (the first time it's used)
     86  * and then keep this pre-initialized context as a template inside the
     87  * spa_t (spa_cksum_tmpls).  If the zio_checksum_info_t contains
     88  * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
     89  * construct and destruct the pre-initialized checksum context.  The
     90  * pre-initialized context is then reused during each checksum
     91  * invocation and passed to the checksum function.
     92  */
     93 
/*
 * Checksum function for table entries that do no real checksumming
 * (it fills both function slots of the "off" and "noparity" entries).
 * The buffer, its size and the context template are ignored; the output
 * checksum is simply set to (0, 0, 0, 0).
 */
/*ARGSUSED*/
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
    101 
/*
 * Table of checksum algorithms, indexed by checksum value (the code in
 * this file always accesses it as zio_checksum_table[checksum]).
 *
 * Each positional initializer has the shape
 *
 *	{ { func, byteswap func }, tmpl_init, tmpl_free, flags, name }
 *
 * which presumably corresponds to the ci_func[2], ci_tmpl_init,
 * ci_tmpl_free, ci_flags and ci_name members used below -- confirm
 * against the zio_checksum_info_t declaration.
 *
 * ci_func[0] is used when generating a checksum and when verifying data
 * that does not need byteswapping; ci_func[1] is used when verifying
 * byteswapped data.  Entries whose functions are NULL ("inherit", "on")
 * are not real algorithms: zio_checksum_error_impl() rejects them with
 * EINVAL and zio_checksum_compute() asserts against them.
 *
 * The entries are positional, so the order here must match the checksum
 * numbering (NOTE(review): verify against enum zio_checksum), and the
 * conditionally compiled entries can only live at the tail of the table.
 */
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	/* Property placeholders: no checksum functions of their own. */
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,		zio_checksum_off},
	    NULL, NULL, 0, "off"},
	/* Embedded checksums: the checksum is stored inside the block. */
	{{zio_checksum_SHA256,		zio_checksum_SHA256},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
	    "label"},
	{{zio_checksum_SHA256,		zio_checksum_SHA256},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
	    "gang_header"},
	{{fletcher_2_native,		fletcher_2_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,		fletcher_2_byteswap},
	    NULL, NULL, 0, "fletcher2"},
	{{fletcher_4_native,		fletcher_4_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,		zio_checksum_SHA256},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
	{{fletcher_4_native,		fletcher_4_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
	{{zio_checksum_off,		zio_checksum_off},
	    NULL, NULL, 0, "noparity"},
#ifndef __NetBSD__
	/* Not built on NetBSD. */
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
	/* Salted; uses a per-pool context template (tmpl_init/tmpl_free). */
	{{zio_checksum_skein_native,	zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
#endif
#ifdef illumos
	/* Only built on illumos; salted, with a context template. */
	{{zio_checksum_edonr_native,	zio_checksum_edonr_byteswap},
	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
#endif
};
    142 
    143 /*
    144  * The flag corresponding to the "verify" in dedup=[checksum,]verify
    145  * must be cleared first, so callers should use ZIO_CHECKSUM_MASK.
    146  */
    147 spa_feature_t
    148 zio_checksum_to_feature(enum zio_checksum cksum)
    149 {
    150 	VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
    151 
    152 	switch (cksum) {
    153 #ifndef __NetBSD__
    154 	case ZIO_CHECKSUM_SHA512:
    155 		return (SPA_FEATURE_SHA512);
    156 	case ZIO_CHECKSUM_SKEIN:
    157 		return (SPA_FEATURE_SKEIN);
    158 #endif
    159 #ifdef illumos
    160 	case ZIO_CHECKSUM_EDONR:
    161 		return (SPA_FEATURE_EDONR);
    162 #endif
    163 	}
    164 	return (SPA_FEATURE_NONE);
    165 }
    166 
    167 enum zio_checksum
    168 zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
    169 {
    170 	ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
    171 	ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
    172 	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
    173 
    174 	if (child == ZIO_CHECKSUM_INHERIT)
    175 		return (parent);
    176 
    177 	if (child == ZIO_CHECKSUM_ON)
    178 		return (ZIO_CHECKSUM_ON_VALUE);
    179 
    180 	return (child);
    181 }
    182 
    183 enum zio_checksum
    184 zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
    185     enum zio_checksum parent)
    186 {
    187 	ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
    188 	ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
    189 	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
    190 
    191 	if (child == ZIO_CHECKSUM_INHERIT)
    192 		return (parent);
    193 
    194 	if (child == ZIO_CHECKSUM_ON)
    195 		return (spa_dedup_checksum(spa));
    196 
    197 	if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
    198 		return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
    199 
    200 	ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
    201 	    ZCHECKSUM_FLAG_DEDUP) ||
    202 	    (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
    203 
    204 	return (child);
    205 }
    206 
    207 /*
    208  * Set the external verifier for a gang block based on <vdev, offset, txg>,
    209  * a tuple which is guaranteed to be unique for the life of the pool.
    210  */
    211 static void
    212 zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
    213 {
    214 	dva_t *dva = BP_IDENTITY(bp);
    215 	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
    216 
    217 	ASSERT(BP_IS_GANG(bp));
    218 
    219 	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
    220 }
    221 
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 *
 * The verifier is (offset, 0, 0, 0): only the first checksum word carries
 * information.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
    232 
    233 /*
    234  * Calls the template init function of a checksum which supports context
    235  * templates and installs the template into the spa_t.
    236  */
    237 static void
    238 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
    239 {
    240 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
    241 
    242 	if (ci->ci_tmpl_init == NULL)
    243 		return;
    244 	if (spa->spa_cksum_tmpls[checksum] != NULL)
    245 		return;
    246 
    247 	VERIFY(ci->ci_tmpl_free != NULL);
    248 	mutex_enter(&spa->spa_cksum_tmpls_lock);
    249 	if (spa->spa_cksum_tmpls[checksum] == NULL) {
    250 		spa->spa_cksum_tmpls[checksum] =
    251 		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
    252 		VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
    253 	}
    254 	mutex_exit(&spa->spa_cksum_tmpls_lock);
    255 }
    256 
    257 /*
    258  * Generate the checksum.
    259  */
    260 void
    261 zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
    262     void *data, uint64_t size)
    263 {
    264 	blkptr_t *bp = zio->io_bp;
    265 	uint64_t offset = zio->io_offset;
    266 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
    267 	zio_cksum_t cksum;
    268 	spa_t *spa = zio->io_spa;
    269 
    270 	ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
    271 	ASSERT(ci->ci_func[0] != NULL);
    272 
    273 	zio_checksum_template_init(checksum, spa);
    274 
    275 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
    276 		zio_eck_t *eck;
    277 
    278 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
    279 			zil_chain_t *zilc = data;
    280 
    281 			size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ,
    282 			    uint64_t);
    283 			eck = &zilc->zc_eck;
    284 		} else {
    285 			eck = (zio_eck_t *)((char *)data + size) - 1;
    286 		}
    287 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
    288 			zio_checksum_gang_verifier(&eck->zec_cksum, bp);
    289 		else if (checksum == ZIO_CHECKSUM_LABEL)
    290 			zio_checksum_label_verifier(&eck->zec_cksum, offset);
    291 		else
    292 			bp->blk_cksum = eck->zec_cksum;
    293 		eck->zec_magic = ZEC_MAGIC;
    294 		ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
    295 		    &cksum);
    296 		eck->zec_cksum = cksum;
    297 	} else {
    298 		ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
    299 		    &bp->blk_cksum);
    300 	}
    301 }
    302 
    303 int
    304 zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
    305     void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
    306 {
    307 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
    308 	zio_cksum_t actual_cksum, expected_cksum;
    309 	int byteswap;
    310 
    311 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
    312 		return (SET_ERROR(EINVAL));
    313 
    314 	zio_checksum_template_init(checksum, spa);
    315 
    316 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
    317 		zio_eck_t *eck;
    318 		zio_cksum_t verifier;
    319 
    320 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
    321 			zil_chain_t *zilc = data;
    322 			uint64_t nused;
    323 
    324 			eck = &zilc->zc_eck;
    325 			if (eck->zec_magic == ZEC_MAGIC)
    326 				nused = zilc->zc_nused;
    327 			else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
    328 				nused = BSWAP_64(zilc->zc_nused);
    329 			else
    330 				return (SET_ERROR(ECKSUM));
    331 
    332 			if (nused > size)
    333 				return (SET_ERROR(ECKSUM));
    334 
    335 			size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
    336 		} else {
    337 			eck = (zio_eck_t *)((char *)data + size) - 1;
    338 		}
    339 
    340 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
    341 			zio_checksum_gang_verifier(&verifier, bp);
    342 		else if (checksum == ZIO_CHECKSUM_LABEL)
    343 			zio_checksum_label_verifier(&verifier, offset);
    344 		else
    345 			verifier = bp->blk_cksum;
    346 
    347 		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
    348 
    349 		if (byteswap)
    350 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
    351 
    352 		expected_cksum = eck->zec_cksum;
    353 		eck->zec_cksum = verifier;
    354 		ci->ci_func[byteswap](data, size,
    355 		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
    356 		eck->zec_cksum = expected_cksum;
    357 
    358 		if (byteswap) {
    359 			byteswap_uint64_array(&expected_cksum,
    360 			    sizeof (zio_cksum_t));
    361 		}
    362 	} else {
    363 		byteswap = BP_SHOULD_BYTESWAP(bp);
    364 		expected_cksum = bp->blk_cksum;
    365 		ci->ci_func[byteswap](data, size,
    366 		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
    367 	}
    368 
    369 	if (info != NULL) {
    370 		info->zbc_expected = expected_cksum;
    371 		info->zbc_actual = actual_cksum;
    372 		info->zbc_checksum_name = ci->ci_name;
    373 		info->zbc_byteswapped = byteswap;
    374 		info->zbc_injected = 0;
    375 		info->zbc_has_cksum = 1;
    376 	}
    377 
    378 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
    379 		return (SET_ERROR(ECKSUM));
    380 
    381 	return (0);
    382 }
    383 
    384 int
    385 zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
    386 {
    387 	blkptr_t *bp = zio->io_bp;
    388 	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
    389 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
    390 	int error;
    391 	uint64_t size = (bp == NULL ? zio->io_size :
    392 	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
    393 	uint64_t offset = zio->io_offset;
    394 	void *data = zio->io_data;
    395 	spa_t *spa = zio->io_spa;
    396 
    397 	error = zio_checksum_error_impl(spa, bp, checksum, data, size,
    398 	    offset, info);
    399 	if (error != 0 && zio_injection_enabled && !zio->io_error &&
    400 	    (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
    401 
    402 		info->zbc_injected = 1;
    403 		return (error);
    404 	}
    405 	return (error);
    406 }
    407 
    408 /*
    409  * Called by a spa_t that's about to be deallocated. This steps through
    410  * all of the checksum context templates and deallocates any that were
    411  * initialized using the algorithm-specific template init function.
    412  */
    413 void
    414 zio_checksum_templates_free(spa_t *spa)
    415 {
    416 	for (enum zio_checksum checksum = 0;
    417 	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
    418 		if (spa->spa_cksum_tmpls[checksum] != NULL) {
    419 			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
    420 
    421 			VERIFY(ci->ci_tmpl_free != NULL);
    422 			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
    423 			spa->spa_cksum_tmpls[checksum] = NULL;
    424 		}
    425 	}
    426 }
    427