Home | History | Annotate | Line # | Download | only in cprng_fast
cprng_fast.c revision 1.15
      1 /*	$NetBSD: cprng_fast.c,v 1.15 2020/04/30 03:29:45 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2014 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Taylor R. Campbell.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: cprng_fast.c,v 1.15 2020/04/30 03:29:45 riastradh Exp $");
     34 
     35 #include <sys/types.h>
     36 #include <sys/param.h>
     37 #include <sys/bitops.h>
     38 #include <sys/cprng.h>
     39 #include <sys/cpu.h>
     40 #include <sys/entropy.h>
     41 #include <sys/evcnt.h>
     42 #include <sys/intr.h>
     43 #include <sys/kmem.h>
     44 #include <sys/percpu.h>
     45 
     46 /* ChaCha core */
     48 
     49 #define	crypto_core_OUTPUTWORDS	16
     50 #define	crypto_core_INPUTWORDS	4
     51 #define	crypto_core_KEYWORDS	8
     52 #define	crypto_core_CONSTWORDS	4
     53 
     54 #define	crypto_core_ROUNDS	8
     55 
     56 static uint32_t
     57 rotate(uint32_t u, unsigned c)
     58 {
     59 
     60 	return (u << c) | (u >> (32 - c));
     61 }
     62 
     63 #define	QUARTERROUND(a, b, c, d) do {					      \
     64 	(a) += (b); (d) ^= (a); (d) = rotate((d), 16);			      \
     65 	(c) += (d); (b) ^= (c); (b) = rotate((b), 12);			      \
     66 	(a) += (b); (d) ^= (a); (d) = rotate((d),  8);			      \
     67 	(c) += (d); (b) ^= (c); (b) = rotate((b),  7);			      \
     68 } while (0)
     69 
     70 static void
     71 crypto_core(uint32_t *out, const uint32_t *in, const uint32_t *k,
     72     const uint32_t *c)
     73 {
     74 	uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
     75 	int i;
     76 
     77 	x0 = c[0];
     78 	x1 = c[1];
     79 	x2 = c[2];
     80 	x3 = c[3];
     81 	x4 = k[0];
     82 	x5 = k[1];
     83 	x6 = k[2];
     84 	x7 = k[3];
     85 	x8 = k[4];
     86 	x9 = k[5];
     87 	x10 = k[6];
     88 	x11 = k[7];
     89 	x12 = in[0];
     90 	x13 = in[1];
     91 	x14 = in[2];
     92 	x15 = in[3];
     93 
     94 	for (i = crypto_core_ROUNDS; i > 0; i -= 2) {
     95 		QUARTERROUND( x0, x4, x8,x12);
     96 		QUARTERROUND( x1, x5, x9,x13);
     97 		QUARTERROUND( x2, x6,x10,x14);
     98 		QUARTERROUND( x3, x7,x11,x15);
     99 		QUARTERROUND( x0, x5,x10,x15);
    100 		QUARTERROUND( x1, x6,x11,x12);
    101 		QUARTERROUND( x2, x7, x8,x13);
    102 		QUARTERROUND( x3, x4, x9,x14);
    103 	}
    104 
    105 	out[0] = x0 + c[0];
    106 	out[1] = x1 + c[1];
    107 	out[2] = x2 + c[2];
    108 	out[3] = x3 + c[3];
    109 	out[4] = x4 + k[0];
    110 	out[5] = x5 + k[1];
    111 	out[6] = x6 + k[2];
    112 	out[7] = x7 + k[3];
    113 	out[8] = x8 + k[4];
    114 	out[9] = x9 + k[5];
    115 	out[10] = x10 + k[6];
    116 	out[11] = x11 + k[7];
    117 	out[12] = x12 + in[0];
    118 	out[13] = x13 + in[1];
    119 	out[14] = x14 + in[2];
    120 	out[15] = x15 + in[3];
    121 }
    122 
    123 /* `expand 32-byte k' */
    125 static const uint32_t crypto_core_constant32[4] = {
    126 	0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U,
    127 };
    128 
    129 /*
    130  * Test vector for ChaCha20 from
    131  * <http://tools.ietf.org/html/draft-strombergson-chacha-test-vectors-00>,
    132  * test vectors for ChaCha12 and ChaCha8 generated by the same
    133  * crypto_core code with crypto_core_ROUNDS varied.
    134  */
    135 
    136 #define	check(E)	do						\
    137 {									\
    138 	if (!(E))							\
    139 		panic("crypto self-test failed: %s", #E);		\
    140 } while (0)
    141 
    142 static void
    143 crypto_core_selftest(void)
    144 {
    145 	const uint32_t zero32[8] = {0};
    146 	const uint8_t sigma[] = "expand 32-byte k";
    147 	uint32_t block[16];
    148 	unsigned i;
    149 
    150 #if crypto_core_ROUNDS == 8
    151 	static const uint8_t out[64] = {
    152 		0x3e,0x00,0xef,0x2f,0x89,0x5f,0x40,0xd6,
    153 		0x7f,0x5b,0xb8,0xe8,0x1f,0x09,0xa5,0xa1,
    154 		0x2c,0x84,0x0e,0xc3,0xce,0x9a,0x7f,0x3b,
    155 		0x18,0x1b,0xe1,0x88,0xef,0x71,0x1a,0x1e,
    156 		0x98,0x4c,0xe1,0x72,0xb9,0x21,0x6f,0x41,
    157 		0x9f,0x44,0x53,0x67,0x45,0x6d,0x56,0x19,
    158 		0x31,0x4a,0x42,0xa3,0xda,0x86,0xb0,0x01,
    159 		0x38,0x7b,0xfd,0xb8,0x0e,0x0c,0xfe,0x42,
    160 	};
    161 #elif crypto_core_ROUNDS == 12
    162 	static const uint8_t out[64] = {
    163 		0x9b,0xf4,0x9a,0x6a,0x07,0x55,0xf9,0x53,
    164 		0x81,0x1f,0xce,0x12,0x5f,0x26,0x83,0xd5,
    165 		0x04,0x29,0xc3,0xbb,0x49,0xe0,0x74,0x14,
    166 		0x7e,0x00,0x89,0xa5,0x2e,0xae,0x15,0x5f,
    167 		0x05,0x64,0xf8,0x79,0xd2,0x7a,0xe3,0xc0,
    168 		0x2c,0xe8,0x28,0x34,0xac,0xfa,0x8c,0x79,
    169 		0x3a,0x62,0x9f,0x2c,0xa0,0xde,0x69,0x19,
    170 		0x61,0x0b,0xe8,0x2f,0x41,0x13,0x26,0xbe,
    171 	};
    172 #elif crypto_core_ROUNDS == 20
    173 	static const uint8_t out[64] = {
    174 		0x76,0xb8,0xe0,0xad,0xa0,0xf1,0x3d,0x90,
    175 		0x40,0x5d,0x6a,0xe5,0x53,0x86,0xbd,0x28,
    176 		0xbd,0xd2,0x19,0xb8,0xa0,0x8d,0xed,0x1a,
    177 		0xa8,0x36,0xef,0xcc,0x8b,0x77,0x0d,0xc7,
    178 		0xda,0x41,0x59,0x7c,0x51,0x57,0x48,0x8d,
    179 		0x77,0x24,0xe0,0x3f,0xb8,0xd8,0x4a,0x37,
    180 		0x6a,0x43,0xb8,0xf4,0x15,0x18,0xa1,0x1c,
    181 		0xc3,0x87,0xb6,0x69,0xb2,0xee,0x65,0x86,
    182 	};
    183 #else
    184 #error crypto_core_ROUNDS must be 8, 12, or 20.
    185 #endif
    186 
    187 	check(crypto_core_constant32[0] == le32dec(&sigma[0]));
    188 	check(crypto_core_constant32[1] == le32dec(&sigma[4]));
    189 	check(crypto_core_constant32[2] == le32dec(&sigma[8]));
    190 	check(crypto_core_constant32[3] == le32dec(&sigma[12]));
    191 
    192 	crypto_core(block, zero32, zero32, crypto_core_constant32);
    193 	for (i = 0; i < 16; i++)
    194 		check(block[i] == le32dec(&out[i*4]));
    195 }
    196 
    197 #undef check
    198 
    199 #define	CPRNG_FAST_SEED_BYTES	(crypto_core_KEYWORDS * sizeof(uint32_t))
    201 
    202 struct cprng_fast {
    203 	uint32_t 	buffer[crypto_core_OUTPUTWORDS];
    204 	uint32_t 	key[crypto_core_KEYWORDS];
    205 	uint32_t 	nonce[crypto_core_INPUTWORDS];
    206 	struct evcnt	*reseed_evcnt;
    207 	unsigned	epoch;
    208 };
    209 
    210 __CTASSERT(sizeof ((struct cprng_fast *)0)->key == CPRNG_FAST_SEED_BYTES);
    211 
    212 static void	cprng_fast_init_cpu(void *, void *, struct cpu_info *);
    213 static void	cprng_fast_schedule_reseed(struct cprng_fast *);
    214 static void	cprng_fast_intr(void *);
    215 
    216 static void	cprng_fast_seed(struct cprng_fast *, const void *);
    217 static void	cprng_fast_buf(struct cprng_fast *, void *, unsigned);
    218 
    219 static void	cprng_fast_buf_short(void *, size_t);
    220 static void	cprng_fast_buf_long(void *, size_t);
    221 
    222 static percpu_t	*cprng_fast_percpu	__read_mostly;
    223 static void	*cprng_fast_softint	__read_mostly;
    224 
    225 void
    226 cprng_fast_init(void)
    227 {
    228 
    229 	crypto_core_selftest();
    230 	cprng_fast_percpu = percpu_create(sizeof(struct cprng_fast),
    231 	    cprng_fast_init_cpu, NULL, NULL);
    232 	cprng_fast_softint = softint_establish(SOFTINT_SERIAL|SOFTINT_MPSAFE,
    233 	    &cprng_fast_intr, NULL);
    234 }
    235 
    236 static void
    237 cprng_fast_init_cpu(void *p, void *arg __unused, struct cpu_info *ci)
    238 {
    239 	struct cprng_fast *const cprng = p;
    240 	uint8_t seed[CPRNG_FAST_SEED_BYTES];
    241 
    242 	cprng->epoch = entropy_epoch();
    243 	cprng_strong(kern_cprng, seed, sizeof seed, 0);
    244 	cprng_fast_seed(cprng, seed);
    245 	(void)explicit_memset(seed, 0, sizeof seed);
    246 
    247 	cprng->reseed_evcnt = kmem_alloc(sizeof(*cprng->reseed_evcnt),
    248 	    KM_SLEEP);
    249 	evcnt_attach_dynamic(cprng->reseed_evcnt, EVCNT_TYPE_MISC, NULL,
    250 	    ci->ci_cpuname, "cprng_fast reseed");
    251 }
    252 
    253 static inline int
    255 cprng_fast_get(struct cprng_fast **cprngp)
    256 {
    257 	struct cprng_fast *cprng;
    258 	int s;
    259 
    260 	*cprngp = cprng = percpu_getref(cprng_fast_percpu);
    261 	s = splvm();
    262 
    263 	if (__predict_false(cprng->epoch != entropy_epoch()))
    264 		cprng_fast_schedule_reseed(cprng);
    265 
    266 	return s;
    267 }
    268 
    269 static inline void
    270 cprng_fast_put(struct cprng_fast *cprng, int s)
    271 {
    272 
    273 	KASSERT((cprng == percpu_getref(cprng_fast_percpu)) &&
    274 	    (percpu_putref(cprng_fast_percpu), true));
    275 	splx(s);
    276 	percpu_putref(cprng_fast_percpu);
    277 }
    278 
    279 static void
    280 cprng_fast_schedule_reseed(struct cprng_fast *cprng __unused)
    281 {
    282 
    283 	softint_schedule(cprng_fast_softint);
    284 }
    285 
    286 static void
    287 cprng_fast_intr(void *cookie __unused)
    288 {
    289 	unsigned epoch = entropy_epoch();
    290 	struct cprng_fast *cprng;
    291 	uint8_t seed[CPRNG_FAST_SEED_BYTES];
    292 	int s;
    293 
    294 	cprng_strong(kern_cprng, seed, sizeof(seed), 0);
    295 
    296 	cprng = percpu_getref(cprng_fast_percpu);
    297 	s = splvm();
    298 	cprng_fast_seed(cprng, seed);
    299 	cprng->epoch = epoch;
    300 	cprng->reseed_evcnt->ev_count++;
    301 	splx(s);
    302 	percpu_putref(cprng_fast_percpu);
    303 
    304 	explicit_memset(seed, 0, sizeof(seed));
    305 }
    306 
    307 /* CPRNG algorithm */
    309 
    310 /*
    311  * The state consists of a key, the current nonce, and a 64-byte buffer
    312  * of output.  Since we fill the buffer only when we need output, and
    313  * eat a 32-bit word at a time, one 32-bit word of the buffer would be
    314  * wasted.  Instead, we repurpose it to count the number of entries in
    315  * the buffer remaining, counting from high to low in order to allow
    316  * comparison to zero to detect when we need to refill it.
    317  */
    318 #define	CPRNG_FAST_BUFIDX	(crypto_core_OUTPUTWORDS - 1)
    319 
    320 static void
    321 cprng_fast_seed(struct cprng_fast *cprng, const void *seed)
    322 {
    323 
    324 	(void)memset(cprng->buffer, 0, sizeof cprng->buffer);
    325 	(void)memcpy(cprng->key, seed, sizeof cprng->key);
    326 	(void)memset(cprng->nonce, 0, sizeof cprng->nonce);
    327 }
    328 
    329 static inline uint32_t
    330 cprng_fast_word(struct cprng_fast *cprng)
    331 {
    332 	uint32_t v;
    333 
    334 	if (__predict_true(0 < cprng->buffer[CPRNG_FAST_BUFIDX])) {
    335 		v = cprng->buffer[--cprng->buffer[CPRNG_FAST_BUFIDX]];
    336 	} else {
    337 		/* If we don't have enough words, refill the buffer.  */
    338 		crypto_core(cprng->buffer, cprng->nonce, cprng->key,
    339 		    crypto_core_constant32);
    340 		if (__predict_false(++cprng->nonce[0] == 0)) {
    341 			cprng->nonce[1]++;
    342 			cprng_fast_schedule_reseed(cprng);
    343 		}
    344 		v = cprng->buffer[CPRNG_FAST_BUFIDX];
    345 		cprng->buffer[CPRNG_FAST_BUFIDX] = CPRNG_FAST_BUFIDX;
    346 	}
    347 
    348 	return v;
    349 }
    350 
    351 static inline void
    352 cprng_fast_buf(struct cprng_fast *cprng, void *buf, unsigned n)
    353 {
    354 	uint8_t *p = buf;
    355 	uint32_t v;
    356 	unsigned w, r;
    357 
    358 	w = n / sizeof(uint32_t);
    359 	while (w--) {
    360 		v = cprng_fast_word(cprng);
    361 		(void)memcpy(p, &v, 4);
    362 		p += 4;
    363 	}
    364 
    365 	r = n % sizeof(uint32_t);
    366 	if (r) {
    367 		v = cprng_fast_word(cprng);
    368 		while (r--) {
    369 			*p++ = (v & 0xff);
    370 			v >>= 8;
    371 		}
    372 	}
    373 }
    374 
    375 /*
    377  * crypto_onetimestream: Expand a short unpredictable one-time seed
    378  * into a long unpredictable output.
    379  */
    380 static void
    381 crypto_onetimestream(const uint32_t seed[crypto_core_KEYWORDS], void *buf,
    382     size_t n)
    383 {
    384 	uint32_t block[crypto_core_OUTPUTWORDS];
    385 	uint32_t nonce[crypto_core_INPUTWORDS] = {0};
    386 	uint8_t *p8;
    387 	uint32_t *p32;
    388 	size_t ni, nb, nf;
    389 
    390 	/*
    391 	 * Guarantee we can generate up to n bytes.  We have
    392 	 * 2^(32*INPUTWORDS) possible inputs yielding output of
    393 	 * 4*OUTPUTWORDS*2^(32*INPUTWORDS) bytes.  It suffices to
    394 	 * require that sizeof n > (1/CHAR_BIT) log_2 n be less than
    395 	 * (1/CHAR_BIT) log_2 of the total output stream length.  We
    396 	 * have
    397 	 *
    398 	 *	log_2 (4 o 2^(32 i)) = log_2 (4 o) + log_2 2^(32 i)
    399 	 *	  = 2 + log_2 o + 32 i.
    400 	 */
    401 	__CTASSERT(CHAR_BIT*sizeof n <=
    402 	    (2 + ilog2(crypto_core_OUTPUTWORDS) + 32*crypto_core_INPUTWORDS));
    403 
    404 	p8 = buf;
    405 	p32 = (uint32_t *)roundup2((uintptr_t)p8, sizeof(uint32_t));
    406 	ni = (uint8_t *)p32 - p8;
    407 	if (n < ni)
    408 		ni = n;
    409 	nb = (n - ni) / sizeof block;
    410 	nf = (n - ni) % sizeof block;
    411 
    412 	KASSERT(((uintptr_t)p32 & 3) == 0);
    413 	KASSERT(ni <= n);
    414 	KASSERT(nb <= (n / sizeof block));
    415 	KASSERT(nf <= n);
    416 	KASSERT(n == (ni + (nb * sizeof block) + nf));
    417 	KASSERT(ni < sizeof(uint32_t));
    418 	KASSERT(nf < sizeof block);
    419 
    420 	if (ni) {
    421 		crypto_core(block, nonce, seed, crypto_core_constant32);
    422 		nonce[0]++;
    423 		(void)memcpy(p8, block, ni);
    424 	}
    425 	while (nb--) {
    426 		crypto_core(p32, nonce, seed, crypto_core_constant32);
    427 		if (++nonce[0] == 0)
    428 			nonce[1]++;
    429 		p32 += crypto_core_OUTPUTWORDS;
    430 	}
    431 	if (nf) {
    432 		crypto_core(block, nonce, seed, crypto_core_constant32);
    433 		if (++nonce[0] == 0)
    434 			nonce[1]++;
    435 		(void)memcpy(p32, block, nf);
    436 	}
    437 
    438 	if (ni | nf)
    439 		(void)explicit_memset(block, 0, sizeof block);
    440 }
    441 
    442 /* Public API */
    444 
    445 uint32_t
    446 cprng_fast32(void)
    447 {
    448 	struct cprng_fast *cprng;
    449 	uint32_t v;
    450 	int s;
    451 
    452 	s = cprng_fast_get(&cprng);
    453 	v = cprng_fast_word(cprng);
    454 	cprng_fast_put(cprng, s);
    455 
    456 	return v;
    457 }
    458 
    459 uint64_t
    460 cprng_fast64(void)
    461 {
    462 	struct cprng_fast *cprng;
    463 	uint32_t hi, lo;
    464 	int s;
    465 
    466 	s = cprng_fast_get(&cprng);
    467 	hi = cprng_fast_word(cprng);
    468 	lo = cprng_fast_word(cprng);
    469 	cprng_fast_put(cprng, s);
    470 
    471 	return ((uint64_t)hi << 32) | lo;
    472 }
    473 
    474 static void
    475 cprng_fast_buf_short(void *buf, size_t len)
    476 {
    477 	struct cprng_fast *cprng;
    478 	int s;
    479 
    480 	s = cprng_fast_get(&cprng);
    481 	cprng_fast_buf(cprng, buf, len);
    482 	cprng_fast_put(cprng, s);
    483 }
    484 
    485 static __noinline void
    486 cprng_fast_buf_long(void *buf, size_t len)
    487 {
    488 	uint32_t seed[crypto_core_KEYWORDS];
    489 	struct cprng_fast *cprng;
    490 	int s;
    491 
    492 	s = cprng_fast_get(&cprng);
    493 	cprng_fast_buf(cprng, seed, sizeof seed);
    494 	cprng_fast_put(cprng, s);
    495 
    496 	crypto_onetimestream(seed, buf, len);
    497 
    498 	(void)explicit_memset(seed, 0, sizeof seed);
    499 }
    500 
    501 size_t
    502 cprng_fast(void *buf, size_t len)
    503 {
    504 
    505 	/*
    506 	 * We don't want to hog the CPU, so we use the short version,
    507 	 * to generate output without preemption, only if we can do it
    508 	 * with at most one crypto_core.
    509 	 */
    510 	if (len <= (sizeof(uint32_t) * crypto_core_OUTPUTWORDS))
    511 		cprng_fast_buf_short(buf, len);
    512 	else
    513 		cprng_fast_buf_long(buf, len);
    514 
    515 	return len;
    516 }
    517