Home | History | Annotate | Line # | Download | only in cprng_fast
cprng_fast.c revision 1.15
      1  1.15  riastrad /*	$NetBSD: cprng_fast.c,v 1.15 2020/04/30 03:29:45 riastradh Exp $	*/
      2   1.2       tls 
      3   1.2       tls /*-
      4   1.2       tls  * Copyright (c) 2014 The NetBSD Foundation, Inc.
      5   1.2       tls  * All rights reserved.
      6   1.2       tls  *
      7   1.2       tls  * This code is derived from software contributed to The NetBSD Foundation
      8   1.2       tls  * by Taylor R. Campbell.
      9   1.2       tls  *
     10   1.2       tls  * Redistribution and use in source and binary forms, with or without
     11   1.2       tls  * modification, are permitted provided that the following conditions
     12   1.2       tls  * are met:
     13   1.2       tls  * 1. Redistributions of source code must retain the above copyright
     14   1.2       tls  *    notice, this list of conditions and the following disclaimer.
     15   1.2       tls  * 2. Redistributions in binary form must reproduce the above copyright
     16   1.2       tls  *    notice, this list of conditions and the following disclaimer in the
     17   1.2       tls  *    documentation and/or other materials provided with the distribution.
     18   1.2       tls  *
     19   1.2       tls  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20   1.2       tls  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21   1.2       tls  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22   1.2       tls  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23   1.2       tls  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24   1.2       tls  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25   1.2       tls  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26   1.2       tls  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27   1.2       tls  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28   1.2       tls  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29   1.2       tls  * POSSIBILITY OF SUCH DAMAGE.
     30   1.2       tls  */
     31   1.2       tls 
     32   1.2       tls #include <sys/cdefs.h>
     33  1.15  riastrad __KERNEL_RCSID(0, "$NetBSD: cprng_fast.c,v 1.15 2020/04/30 03:29:45 riastradh Exp $");
     34   1.2       tls 
     35   1.2       tls #include <sys/types.h>
     36   1.4  riastrad #include <sys/param.h>
     37   1.2       tls #include <sys/bitops.h>
     38   1.4  riastrad #include <sys/cprng.h>
     39   1.2       tls #include <sys/cpu.h>
     40  1.14  riastrad #include <sys/entropy.h>
     41  1.15  riastrad #include <sys/evcnt.h>
     42   1.2       tls #include <sys/intr.h>
     43  1.15  riastrad #include <sys/kmem.h>
     44   1.2       tls #include <sys/percpu.h>
     45   1.2       tls 
     46   1.2       tls /* ChaCha core */
     48   1.2       tls 
/*
 * ChaCha parameters.  All sizes are counted in 32-bit words.  These
 * stay preprocessor macros because crypto_core_ROUNDS is tested with
 * #if when the self-test vector is selected.
 */
#define	crypto_core_ROUNDS	8	/* must be 8, 12, or 20 */

#define	crypto_core_CONSTWORDS	4	/* constant words of the state */
#define	crypto_core_KEYWORDS	8	/* key words of the state */
#define	crypto_core_INPUTWORDS	4	/* input (nonce) words of the state */
#define	crypto_core_OUTPUTWORDS	16	/* words produced per call */
     55   1.2       tls 
/*
 * rotate(u, c): Rotate the 32-bit word u left by c bits.
 *
 * The count is reduced modulo 32, so c == 0 (or any multiple of 32)
 * returns u unchanged instead of performing an undefined 32-bit shift
 * by 32.  For counts 1..31 the result is the same as the plain
 * (u << c) | (u >> (32 - c)) formulation.
 */
static uint32_t
rotate(uint32_t u, unsigned c)
{

	c &= 31;
	/* -c & 31 is (32 - c) mod 32, which is 0 when c is 0. */
	return (u << c) | (u >> (-c & 31));
}
     62   1.2       tls 
/*
 * QUARTERROUND(a, b, c, d): Mix four 32-bit state words in place using
 * add, xor, and left-rotate steps with rotation counts 16, 12, 8, 7.
 *
 * Every argument is evaluated several times and is assigned to, so
 * each must be a side-effect-free lvalue.
 */
#define	QUARTERROUND(a, b, c, d) do {					      \
	(a) += (b);							      \
	(d) = rotate((d) ^ (a), 16);					      \
	(c) += (d);							      \
	(b) = rotate((b) ^ (c), 12);					      \
	(a) += (b);							      \
	(d) = rotate((d) ^ (a),  8);					      \
	(c) += (d);							      \
	(b) = rotate((b) ^ (c),  7);					      \
} while (0)
     69   1.2       tls 
     70   1.2       tls static void
     71   1.2       tls crypto_core(uint32_t *out, const uint32_t *in, const uint32_t *k,
     72   1.2       tls     const uint32_t *c)
     73   1.2       tls {
     74   1.2       tls 	uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
     75   1.2       tls 	int i;
     76   1.2       tls 
     77   1.2       tls 	x0 = c[0];
     78   1.2       tls 	x1 = c[1];
     79   1.2       tls 	x2 = c[2];
     80   1.2       tls 	x3 = c[3];
     81   1.2       tls 	x4 = k[0];
     82   1.2       tls 	x5 = k[1];
     83   1.2       tls 	x6 = k[2];
     84   1.2       tls 	x7 = k[3];
     85   1.2       tls 	x8 = k[4];
     86   1.2       tls 	x9 = k[5];
     87   1.2       tls 	x10 = k[6];
     88   1.2       tls 	x11 = k[7];
     89   1.2       tls 	x12 = in[0];
     90   1.2       tls 	x13 = in[1];
     91   1.2       tls 	x14 = in[2];
     92   1.2       tls 	x15 = in[3];
     93   1.2       tls 
     94   1.2       tls 	for (i = crypto_core_ROUNDS; i > 0; i -= 2) {
     95   1.2       tls 		QUARTERROUND( x0, x4, x8,x12);
     96   1.2       tls 		QUARTERROUND( x1, x5, x9,x13);
     97   1.2       tls 		QUARTERROUND( x2, x6,x10,x14);
     98   1.2       tls 		QUARTERROUND( x3, x7,x11,x15);
     99   1.2       tls 		QUARTERROUND( x0, x5,x10,x15);
    100   1.2       tls 		QUARTERROUND( x1, x6,x11,x12);
    101   1.2       tls 		QUARTERROUND( x2, x7, x8,x13);
    102   1.2       tls 		QUARTERROUND( x3, x4, x9,x14);
    103   1.2       tls 	}
    104   1.2       tls 
    105   1.2       tls 	out[0] = x0 + c[0];
    106   1.2       tls 	out[1] = x1 + c[1];
    107   1.2       tls 	out[2] = x2 + c[2];
    108   1.2       tls 	out[3] = x3 + c[3];
    109   1.2       tls 	out[4] = x4 + k[0];
    110   1.2       tls 	out[5] = x5 + k[1];
    111   1.2       tls 	out[6] = x6 + k[2];
    112   1.2       tls 	out[7] = x7 + k[3];
    113   1.2       tls 	out[8] = x8 + k[4];
    114   1.2       tls 	out[9] = x9 + k[5];
    115   1.2       tls 	out[10] = x10 + k[6];
    116   1.2       tls 	out[11] = x11 + k[7];
    117   1.2       tls 	out[12] = x12 + in[0];
    118   1.2       tls 	out[13] = x13 + in[1];
    119   1.2       tls 	out[14] = x14 + in[2];
    120   1.2       tls 	out[15] = x15 + in[3];
    121   1.2       tls }
    122   1.2       tls 
/*
 * The ChaCha constant `expand 32-byte k', stored as four 32-bit words
 * read in little-endian byte order.
 */
static const uint32_t crypto_core_constant32[4] = {
	0x61707865U,	/* "expa" */
	0x3320646eU,	/* "nd 3" */
	0x79622d32U,	/* "2-by" */
	0x6b206574U,	/* "te k" */
};
    128   1.2       tls 
    129   1.2       tls /*
    130   1.2       tls  * Test vector for ChaCha20 from
    131   1.2       tls  * <http://tools.ietf.org/html/draft-strombergson-chacha-test-vectors-00>,
    132   1.2       tls  * test vectors for ChaCha12 and ChaCha8 generated by the same
    133   1.2       tls  * crypto_core code with crypto_core_ROUNDS varied.
    134   1.2       tls  */
    135   1.2       tls 
    136   1.2       tls #define	check(E)	do						\
    137   1.2       tls {									\
    138   1.2       tls 	if (!(E))							\
    139   1.2       tls 		panic("crypto self-test failed: %s", #E);		\
    140   1.2       tls } while (0)
    141   1.2       tls 
    142   1.2       tls static void
    143   1.2       tls crypto_core_selftest(void)
    144   1.2       tls {
    145   1.2       tls 	const uint32_t zero32[8] = {0};
    146   1.2       tls 	const uint8_t sigma[] = "expand 32-byte k";
    147   1.2       tls 	uint32_t block[16];
    148   1.2       tls 	unsigned i;
    149   1.2       tls 
    150   1.2       tls #if crypto_core_ROUNDS == 8
    151   1.2       tls 	static const uint8_t out[64] = {
    152   1.2       tls 		0x3e,0x00,0xef,0x2f,0x89,0x5f,0x40,0xd6,
    153   1.2       tls 		0x7f,0x5b,0xb8,0xe8,0x1f,0x09,0xa5,0xa1,
    154   1.2       tls 		0x2c,0x84,0x0e,0xc3,0xce,0x9a,0x7f,0x3b,
    155   1.2       tls 		0x18,0x1b,0xe1,0x88,0xef,0x71,0x1a,0x1e,
    156   1.2       tls 		0x98,0x4c,0xe1,0x72,0xb9,0x21,0x6f,0x41,
    157   1.2       tls 		0x9f,0x44,0x53,0x67,0x45,0x6d,0x56,0x19,
    158   1.2       tls 		0x31,0x4a,0x42,0xa3,0xda,0x86,0xb0,0x01,
    159   1.2       tls 		0x38,0x7b,0xfd,0xb8,0x0e,0x0c,0xfe,0x42,
    160   1.2       tls 	};
    161   1.2       tls #elif crypto_core_ROUNDS == 12
    162   1.2       tls 	static const uint8_t out[64] = {
    163   1.2       tls 		0x9b,0xf4,0x9a,0x6a,0x07,0x55,0xf9,0x53,
    164   1.2       tls 		0x81,0x1f,0xce,0x12,0x5f,0x26,0x83,0xd5,
    165   1.2       tls 		0x04,0x29,0xc3,0xbb,0x49,0xe0,0x74,0x14,
    166   1.2       tls 		0x7e,0x00,0x89,0xa5,0x2e,0xae,0x15,0x5f,
    167   1.2       tls 		0x05,0x64,0xf8,0x79,0xd2,0x7a,0xe3,0xc0,
    168   1.2       tls 		0x2c,0xe8,0x28,0x34,0xac,0xfa,0x8c,0x79,
    169   1.2       tls 		0x3a,0x62,0x9f,0x2c,0xa0,0xde,0x69,0x19,
    170   1.2       tls 		0x61,0x0b,0xe8,0x2f,0x41,0x13,0x26,0xbe,
    171   1.2       tls 	};
    172   1.2       tls #elif crypto_core_ROUNDS == 20
    173   1.2       tls 	static const uint8_t out[64] = {
    174   1.2       tls 		0x76,0xb8,0xe0,0xad,0xa0,0xf1,0x3d,0x90,
    175   1.2       tls 		0x40,0x5d,0x6a,0xe5,0x53,0x86,0xbd,0x28,
    176   1.2       tls 		0xbd,0xd2,0x19,0xb8,0xa0,0x8d,0xed,0x1a,
    177   1.2       tls 		0xa8,0x36,0xef,0xcc,0x8b,0x77,0x0d,0xc7,
    178   1.2       tls 		0xda,0x41,0x59,0x7c,0x51,0x57,0x48,0x8d,
    179   1.2       tls 		0x77,0x24,0xe0,0x3f,0xb8,0xd8,0x4a,0x37,
    180   1.2       tls 		0x6a,0x43,0xb8,0xf4,0x15,0x18,0xa1,0x1c,
    181   1.2       tls 		0xc3,0x87,0xb6,0x69,0xb2,0xee,0x65,0x86,
    182   1.2       tls 	};
    183   1.2       tls #else
    184   1.2       tls #error crypto_core_ROUNDS must be 8, 12, or 20.
    185   1.2       tls #endif
    186   1.2       tls 
    187   1.2       tls 	check(crypto_core_constant32[0] == le32dec(&sigma[0]));
    188   1.2       tls 	check(crypto_core_constant32[1] == le32dec(&sigma[4]));
    189   1.2       tls 	check(crypto_core_constant32[2] == le32dec(&sigma[8]));
    190   1.2       tls 	check(crypto_core_constant32[3] == le32dec(&sigma[12]));
    191   1.2       tls 
    192   1.2       tls 	crypto_core(block, zero32, zero32, crypto_core_constant32);
    193   1.2       tls 	for (i = 0; i < 16; i++)
    194   1.2       tls 		check(block[i] == le32dec(&out[i*4]));
    195   1.2       tls }
    196   1.2       tls 
    197   1.2       tls #undef check
    198   1.2       tls 
    199   1.2       tls #define	CPRNG_FAST_SEED_BYTES	(crypto_core_KEYWORDS * sizeof(uint32_t))
    201   1.2       tls 
    202   1.2       tls struct cprng_fast {
    203  1.15  riastrad 	uint32_t 	buffer[crypto_core_OUTPUTWORDS];
    204  1.14  riastrad 	uint32_t 	key[crypto_core_KEYWORDS];
    205   1.2       tls 	uint32_t 	nonce[crypto_core_INPUTWORDS];
    206   1.2       tls 	struct evcnt	*reseed_evcnt;
    207   1.2       tls 	unsigned	epoch;
    208   1.2       tls };
    209   1.8  riastrad 
    210   1.2       tls __CTASSERT(sizeof ((struct cprng_fast *)0)->key == CPRNG_FAST_SEED_BYTES);
    211   1.2       tls 
    212   1.2       tls static void	cprng_fast_init_cpu(void *, void *, struct cpu_info *);
    213   1.6  riastrad static void	cprng_fast_schedule_reseed(struct cprng_fast *);
    214   1.2       tls static void	cprng_fast_intr(void *);
    215   1.2       tls 
    216   1.2       tls static void	cprng_fast_seed(struct cprng_fast *, const void *);
    217   1.2       tls static void	cprng_fast_buf(struct cprng_fast *, void *, unsigned);
    218   1.2       tls 
    219   1.2       tls static void	cprng_fast_buf_short(void *, size_t);
    220   1.2       tls static void	cprng_fast_buf_long(void *, size_t);
    221   1.2       tls 
    222   1.2       tls static percpu_t	*cprng_fast_percpu	__read_mostly;
    223   1.2       tls static void	*cprng_fast_softint	__read_mostly;
    224   1.2       tls 
    225   1.2       tls void
    226   1.2       tls cprng_fast_init(void)
    227  1.15  riastrad {
    228  1.15  riastrad 
    229   1.2       tls 	crypto_core_selftest();
    230   1.2       tls 	cprng_fast_percpu = percpu_create(sizeof(struct cprng_fast),
    231   1.2       tls 	    cprng_fast_init_cpu, NULL, NULL);
    232   1.2       tls 	cprng_fast_softint = softint_establish(SOFTINT_SERIAL|SOFTINT_MPSAFE,
    233   1.8  riastrad 	    &cprng_fast_intr, NULL);
    234  1.15  riastrad }
    235   1.8  riastrad 
    236   1.8  riastrad static void
    237   1.8  riastrad cprng_fast_init_cpu(void *p, void *arg __unused, struct cpu_info *ci)
    238   1.8  riastrad {
    239  1.14  riastrad 	struct cprng_fast *const cprng = p;
    240  1.12  riastrad 	uint8_t seed[CPRNG_FAST_SEED_BYTES];
    241   1.8  riastrad 
    242   1.8  riastrad 	cprng->epoch = entropy_epoch();
    243  1.15  riastrad 	cprng_strong(kern_cprng, seed, sizeof seed, 0);
    244  1.15  riastrad 	cprng_fast_seed(cprng, seed);
    245  1.15  riastrad 	(void)explicit_memset(seed, 0, sizeof seed);
    246  1.15  riastrad 
    247  1.15  riastrad 	cprng->reseed_evcnt = kmem_alloc(sizeof(*cprng->reseed_evcnt),
    248   1.8  riastrad 	    KM_SLEEP);
    249   1.9  riastrad 	evcnt_attach_dynamic(cprng->reseed_evcnt, EVCNT_TYPE_MISC, NULL,
    250   1.2       tls 	    ci->ci_cpuname, "cprng_fast reseed");
    251   1.2       tls }
    252   1.2       tls 
    253   1.9  riastrad static inline int
    255   1.9  riastrad cprng_fast_get(struct cprng_fast **cprngp)
    256   1.9  riastrad {
    257   1.9  riastrad 	struct cprng_fast *cprng;
    258   1.2       tls 	int s;
    259  1.14  riastrad 
    260   1.9  riastrad 	*cprngp = cprng = percpu_getref(cprng_fast_percpu);
    261   1.9  riastrad 	s = splvm();
    262   1.9  riastrad 
    263   1.2       tls 	if (__predict_false(cprng->epoch != entropy_epoch()))
    264   1.2       tls 		cprng_fast_schedule_reseed(cprng);
    265   1.2       tls 
    266   1.2       tls 	return s;
    267   1.2       tls }
    268   1.2       tls 
    269   1.2       tls static inline void
    270   1.2       tls cprng_fast_put(struct cprng_fast *cprng, int s)
    271   1.2       tls {
    272   1.2       tls 
    273   1.2       tls 	KASSERT((cprng == percpu_getref(cprng_fast_percpu)) &&
    274   1.9  riastrad 	    (percpu_putref(cprng_fast_percpu), true));
    275  1.11    justin 	splx(s);
    276   1.2       tls 	percpu_putref(cprng_fast_percpu);
    277   1.2       tls }
    278   1.2       tls 
    279   1.2       tls static void
    280   1.2       tls cprng_fast_schedule_reseed(struct cprng_fast *cprng __unused)
    281   1.2       tls {
    282   1.2       tls 
    283   1.2       tls 	softint_schedule(cprng_fast_softint);
    284   1.2       tls }
    285  1.14  riastrad 
    286   1.2       tls static void
    287   1.2       tls cprng_fast_intr(void *cookie __unused)
    288   1.7  riastrad {
    289   1.2       tls 	unsigned epoch = entropy_epoch();
    290  1.12  riastrad 	struct cprng_fast *cprng;
    291   1.2       tls 	uint8_t seed[CPRNG_FAST_SEED_BYTES];
    292   1.2       tls 	int s;
    293   1.7  riastrad 
    294   1.2       tls 	cprng_strong(kern_cprng, seed, sizeof(seed), 0);
    295  1.14  riastrad 
    296  1.15  riastrad 	cprng = percpu_getref(cprng_fast_percpu);
    297   1.7  riastrad 	s = splvm();
    298   1.2       tls 	cprng_fast_seed(cprng, seed);
    299   1.2       tls 	cprng->epoch = epoch;
    300   1.2       tls 	cprng->reseed_evcnt->ev_count++;
    301   1.2       tls 	splx(s);
    302   1.2       tls 	percpu_putref(cprng_fast_percpu);
    303   1.2       tls 
    304   1.2       tls 	explicit_memset(seed, 0, sizeof(seed));
    305   1.2       tls }
    306   1.2       tls 
    307   1.2       tls /* CPRNG algorithm */
    309   1.2       tls 
/*
 * The state consists of a key, the current nonce, and a 64-byte buffer
 * of output.  The buffer is refilled only when output is needed, and
 * the refill hands out the last word of the fresh block immediately,
 * so that slot would otherwise sit unused.  It is reused to hold the
 * number of buffered words still available.  The count runs from high
 * to low, so a comparison with zero detects when a refill is needed.
 */
#define	CPRNG_FAST_BUFIDX	(crypto_core_OUTPUTWORDS - 1)
    319   1.2       tls 
    320   1.2       tls static void
    321   1.2       tls cprng_fast_seed(struct cprng_fast *cprng, const void *seed)
    322   1.2       tls {
    323   1.2       tls 
    324   1.2       tls 	(void)memset(cprng->buffer, 0, sizeof cprng->buffer);
    325   1.2       tls 	(void)memcpy(cprng->key, seed, sizeof cprng->key);
    326   1.2       tls 	(void)memset(cprng->nonce, 0, sizeof cprng->nonce);
    327   1.2       tls }
    328   1.2       tls 
    329   1.2       tls static inline uint32_t
    330   1.2       tls cprng_fast_word(struct cprng_fast *cprng)
    331   1.2       tls {
    332   1.2       tls 	uint32_t v;
    333   1.2       tls 
    334   1.2       tls 	if (__predict_true(0 < cprng->buffer[CPRNG_FAST_BUFIDX])) {
    335   1.2       tls 		v = cprng->buffer[--cprng->buffer[CPRNG_FAST_BUFIDX]];
    336   1.2       tls 	} else {
    337   1.2       tls 		/* If we don't have enough words, refill the buffer.  */
    338   1.2       tls 		crypto_core(cprng->buffer, cprng->nonce, cprng->key,
    339   1.2       tls 		    crypto_core_constant32);
    340   1.2       tls 		if (__predict_false(++cprng->nonce[0] == 0)) {
    341   1.2       tls 			cprng->nonce[1]++;
    342   1.2       tls 			cprng_fast_schedule_reseed(cprng);
    343   1.2       tls 		}
    344   1.2       tls 		v = cprng->buffer[CPRNG_FAST_BUFIDX];
    345   1.2       tls 		cprng->buffer[CPRNG_FAST_BUFIDX] = CPRNG_FAST_BUFIDX;
    346   1.2       tls 	}
    347   1.2       tls 
    348   1.2       tls 	return v;
    349   1.2       tls }
    350   1.2       tls 
/*
 * cprng_fast_buf: Fill the n bytes at buf with generator output.
 *
 * Whole 32-bit words are copied with memcpy, so buf needs no
 * particular alignment.  A trailing run of 1-3 bytes consumes one more
 * word and stores its low-order bytes first; the rest of that word is
 * discarded.
 */
static inline void
cprng_fast_buf(struct cprng_fast *cprng, void *buf, unsigned n)
{
	const unsigned nwords = n / sizeof(uint32_t);
	const unsigned ntail = n % sizeof(uint32_t);
	uint8_t *dst = buf;
	uint32_t v;
	unsigned i;

	for (i = 0; i < nwords; i++) {
		v = cprng_fast_word(cprng);
		(void)memcpy(dst, &v, sizeof v);
		dst += sizeof v;
	}

	if (ntail == 0)
		return;

	v = cprng_fast_word(cprng);
	for (i = 0; i < ntail; i++)
		dst[i] = (v >> (8 * i)) & 0xff;
}
    374   1.2       tls 
    375   1.2       tls /*
    377   1.2       tls  * crypto_onetimestream: Expand a short unpredictable one-time seed
    378   1.2       tls  * into a long unpredictable output.
    379   1.2       tls  */
    380   1.2       tls static void
    381   1.2       tls crypto_onetimestream(const uint32_t seed[crypto_core_KEYWORDS], void *buf,
    382   1.2       tls     size_t n)
    383   1.2       tls {
    384   1.2       tls 	uint32_t block[crypto_core_OUTPUTWORDS];
    385   1.2       tls 	uint32_t nonce[crypto_core_INPUTWORDS] = {0};
    386   1.2       tls 	uint8_t *p8;
    387   1.2       tls 	uint32_t *p32;
    388   1.2       tls 	size_t ni, nb, nf;
    389   1.2       tls 
    390   1.2       tls 	/*
    391   1.2       tls 	 * Guarantee we can generate up to n bytes.  We have
    392   1.2       tls 	 * 2^(32*INPUTWORDS) possible inputs yielding output of
    393   1.2       tls 	 * 4*OUTPUTWORDS*2^(32*INPUTWORDS) bytes.  It suffices to
    394   1.2       tls 	 * require that sizeof n > (1/CHAR_BIT) log_2 n be less than
    395   1.2       tls 	 * (1/CHAR_BIT) log_2 of the total output stream length.  We
    396   1.2       tls 	 * have
    397   1.2       tls 	 *
    398   1.2       tls 	 *	log_2 (4 o 2^(32 i)) = log_2 (4 o) + log_2 2^(32 i)
    399   1.2       tls 	 *	  = 2 + log_2 o + 32 i.
    400   1.2       tls 	 */
    401   1.2       tls 	__CTASSERT(CHAR_BIT*sizeof n <=
    402   1.2       tls 	    (2 + ilog2(crypto_core_OUTPUTWORDS) + 32*crypto_core_INPUTWORDS));
    403   1.2       tls 
    404   1.2       tls 	p8 = buf;
    405   1.2       tls 	p32 = (uint32_t *)roundup2((uintptr_t)p8, sizeof(uint32_t));
    406   1.2       tls 	ni = (uint8_t *)p32 - p8;
    407   1.2       tls 	if (n < ni)
    408   1.2       tls 		ni = n;
    409   1.2       tls 	nb = (n - ni) / sizeof block;
    410   1.2       tls 	nf = (n - ni) % sizeof block;
    411   1.2       tls 
    412   1.2       tls 	KASSERT(((uintptr_t)p32 & 3) == 0);
    413   1.2       tls 	KASSERT(ni <= n);
    414   1.2       tls 	KASSERT(nb <= (n / sizeof block));
    415   1.2       tls 	KASSERT(nf <= n);
    416   1.2       tls 	KASSERT(n == (ni + (nb * sizeof block) + nf));
    417   1.2       tls 	KASSERT(ni < sizeof(uint32_t));
    418   1.2       tls 	KASSERT(nf < sizeof block);
    419   1.2       tls 
    420   1.2       tls 	if (ni) {
    421   1.2       tls 		crypto_core(block, nonce, seed, crypto_core_constant32);
    422   1.2       tls 		nonce[0]++;
    423   1.2       tls 		(void)memcpy(p8, block, ni);
    424   1.2       tls 	}
    425   1.2       tls 	while (nb--) {
    426   1.2       tls 		crypto_core(p32, nonce, seed, crypto_core_constant32);
    427   1.2       tls 		if (++nonce[0] == 0)
    428   1.2       tls 			nonce[1]++;
    429   1.2       tls 		p32 += crypto_core_OUTPUTWORDS;
    430   1.2       tls 	}
    431   1.2       tls 	if (nf) {
    432   1.2       tls 		crypto_core(block, nonce, seed, crypto_core_constant32);
    433   1.2       tls 		if (++nonce[0] == 0)
    434   1.2       tls 			nonce[1]++;
    435   1.2       tls 		(void)memcpy(p32, block, nf);
    436   1.2       tls 	}
    437   1.2       tls 
    438   1.2       tls 	if (ni | nf)
    439   1.2       tls 		(void)explicit_memset(block, 0, sizeof block);
    440   1.2       tls }
    441   1.2       tls 
    442   1.2       tls /* Public API */
    444   1.2       tls 
/*
 * cprng_fast32: Return 32 bits of output from the current CPU's
 * generator.
 */
uint32_t
cprng_fast32(void)
{
	struct cprng_fast *state;
	uint32_t word;
	int saved;

	saved = cprng_fast_get(&state);
	word = cprng_fast_word(state);
	cprng_fast_put(state, saved);

	return word;
}
    458   1.2       tls 
/*
 * cprng_fast64: Return 64 bits of output from the current CPU's
 * generator.  Two consecutive words are drawn; the first becomes the
 * high half of the result and the second the low half.
 */
uint64_t
cprng_fast64(void)
{
	struct cprng_fast *state;
	uint64_t result;
	uint32_t first, second;
	int saved;

	saved = cprng_fast_get(&state);
	first = cprng_fast_word(state);
	second = cprng_fast_word(state);
	cprng_fast_put(state, saved);

	result = (uint64_t)first << 32;
	result |= second;

	return result;
}
    473   1.2       tls 
/*
 * cprng_fast_buf_short: Fill the len bytes at buf directly from the
 * current CPU's generator, holding the per-CPU state for the whole
 * copy.  cprng_fast only uses this for requests of at most one output
 * block.
 */
static void
cprng_fast_buf_short(void *buf, size_t len)
{
	struct cprng_fast *state;
	int saved;

	saved = cprng_fast_get(&state);
	cprng_fast_buf(state, buf, len);
	cprng_fast_put(state, saved);
}
    484   1.2       tls 
    485   1.2       tls static __noinline void
    486   1.2       tls cprng_fast_buf_long(void *buf, size_t len)
    487   1.2       tls {
    488   1.2       tls 	uint32_t seed[crypto_core_KEYWORDS];
    489   1.2       tls 	struct cprng_fast *cprng;
    490   1.2       tls 	int s;
    491   1.2       tls 
    492   1.2       tls 	s = cprng_fast_get(&cprng);
    493   1.2       tls 	cprng_fast_buf(cprng, seed, sizeof seed);
    494   1.2       tls 	cprng_fast_put(cprng, s);
    495   1.2       tls 
    496   1.2       tls 	crypto_onetimestream(seed, buf, len);
    497   1.2       tls 
    498   1.2       tls 	(void)explicit_memset(seed, 0, sizeof seed);
    499   1.2       tls }
    500   1.2       tls 
    501   1.2       tls size_t
    502   1.2       tls cprng_fast(void *buf, size_t len)
    503   1.2       tls {
    504   1.2       tls 
    505   1.2       tls 	/*
    506   1.2       tls 	 * We don't want to hog the CPU, so we use the short version,
    507   1.2       tls 	 * to generate output without preemption, only if we can do it
    508   1.2       tls 	 * with at most one crypto_core.
    509   1.2       tls 	 */
    510                 	if (len <= (sizeof(uint32_t) * crypto_core_OUTPUTWORDS))
    511                 		cprng_fast_buf_short(buf, len);
    512                 	else
    513                 		cprng_fast_buf_long(buf, len);
    514                 
    515                 	return len;
    516                 }
    517