cprng_fast.c revision 1.15 1 /* $NetBSD: cprng_fast.c,v 1.15 2020/04/30 03:29:45 riastradh Exp $ */
2
3 /*-
4 * Copyright (c) 2014 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Taylor R. Campbell.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: cprng_fast.c,v 1.15 2020/04/30 03:29:45 riastradh Exp $");
34
35 #include <sys/types.h>
36 #include <sys/param.h>
37 #include <sys/bitops.h>
38 #include <sys/cprng.h>
39 #include <sys/cpu.h>
40 #include <sys/entropy.h>
41 #include <sys/evcnt.h>
42 #include <sys/intr.h>
43 #include <sys/kmem.h>
44 #include <sys/percpu.h>
45
46 /* ChaCha core */
48
49 #define crypto_core_OUTPUTWORDS 16
50 #define crypto_core_INPUTWORDS 4
51 #define crypto_core_KEYWORDS 8
52 #define crypto_core_CONSTWORDS 4
53
54 #define crypto_core_ROUNDS 8
55
56 static uint32_t
57 rotate(uint32_t u, unsigned c)
58 {
59
60 return (u << c) | (u >> (32 - c));
61 }
62
63 #define QUARTERROUND(a, b, c, d) do { \
64 (a) += (b); (d) ^= (a); (d) = rotate((d), 16); \
65 (c) += (d); (b) ^= (c); (b) = rotate((b), 12); \
66 (a) += (b); (d) ^= (a); (d) = rotate((d), 8); \
67 (c) += (d); (b) ^= (c); (b) = rotate((b), 7); \
68 } while (0)
69
70 static void
71 crypto_core(uint32_t *out, const uint32_t *in, const uint32_t *k,
72 const uint32_t *c)
73 {
74 uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
75 int i;
76
77 x0 = c[0];
78 x1 = c[1];
79 x2 = c[2];
80 x3 = c[3];
81 x4 = k[0];
82 x5 = k[1];
83 x6 = k[2];
84 x7 = k[3];
85 x8 = k[4];
86 x9 = k[5];
87 x10 = k[6];
88 x11 = k[7];
89 x12 = in[0];
90 x13 = in[1];
91 x14 = in[2];
92 x15 = in[3];
93
94 for (i = crypto_core_ROUNDS; i > 0; i -= 2) {
95 QUARTERROUND( x0, x4, x8,x12);
96 QUARTERROUND( x1, x5, x9,x13);
97 QUARTERROUND( x2, x6,x10,x14);
98 QUARTERROUND( x3, x7,x11,x15);
99 QUARTERROUND( x0, x5,x10,x15);
100 QUARTERROUND( x1, x6,x11,x12);
101 QUARTERROUND( x2, x7, x8,x13);
102 QUARTERROUND( x3, x4, x9,x14);
103 }
104
105 out[0] = x0 + c[0];
106 out[1] = x1 + c[1];
107 out[2] = x2 + c[2];
108 out[3] = x3 + c[3];
109 out[4] = x4 + k[0];
110 out[5] = x5 + k[1];
111 out[6] = x6 + k[2];
112 out[7] = x7 + k[3];
113 out[8] = x8 + k[4];
114 out[9] = x9 + k[5];
115 out[10] = x10 + k[6];
116 out[11] = x11 + k[7];
117 out[12] = x12 + in[0];
118 out[13] = x13 + in[1];
119 out[14] = x14 + in[2];
120 out[15] = x15 + in[3];
121 }
122
123 /* `expand 32-byte k' */
125 static const uint32_t crypto_core_constant32[4] = {
126 0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U,
127 };
128
129 /*
130 * Test vector for ChaCha20 from
131 * <http://tools.ietf.org/html/draft-strombergson-chacha-test-vectors-00>,
132 * test vectors for ChaCha12 and ChaCha8 generated by the same
133 * crypto_core code with crypto_core_ROUNDS varied.
134 */
135
136 #define check(E) do \
137 { \
138 if (!(E)) \
139 panic("crypto self-test failed: %s", #E); \
140 } while (0)
141
142 static void
143 crypto_core_selftest(void)
144 {
145 const uint32_t zero32[8] = {0};
146 const uint8_t sigma[] = "expand 32-byte k";
147 uint32_t block[16];
148 unsigned i;
149
150 #if crypto_core_ROUNDS == 8
151 static const uint8_t out[64] = {
152 0x3e,0x00,0xef,0x2f,0x89,0x5f,0x40,0xd6,
153 0x7f,0x5b,0xb8,0xe8,0x1f,0x09,0xa5,0xa1,
154 0x2c,0x84,0x0e,0xc3,0xce,0x9a,0x7f,0x3b,
155 0x18,0x1b,0xe1,0x88,0xef,0x71,0x1a,0x1e,
156 0x98,0x4c,0xe1,0x72,0xb9,0x21,0x6f,0x41,
157 0x9f,0x44,0x53,0x67,0x45,0x6d,0x56,0x19,
158 0x31,0x4a,0x42,0xa3,0xda,0x86,0xb0,0x01,
159 0x38,0x7b,0xfd,0xb8,0x0e,0x0c,0xfe,0x42,
160 };
161 #elif crypto_core_ROUNDS == 12
162 static const uint8_t out[64] = {
163 0x9b,0xf4,0x9a,0x6a,0x07,0x55,0xf9,0x53,
164 0x81,0x1f,0xce,0x12,0x5f,0x26,0x83,0xd5,
165 0x04,0x29,0xc3,0xbb,0x49,0xe0,0x74,0x14,
166 0x7e,0x00,0x89,0xa5,0x2e,0xae,0x15,0x5f,
167 0x05,0x64,0xf8,0x79,0xd2,0x7a,0xe3,0xc0,
168 0x2c,0xe8,0x28,0x34,0xac,0xfa,0x8c,0x79,
169 0x3a,0x62,0x9f,0x2c,0xa0,0xde,0x69,0x19,
170 0x61,0x0b,0xe8,0x2f,0x41,0x13,0x26,0xbe,
171 };
172 #elif crypto_core_ROUNDS == 20
173 static const uint8_t out[64] = {
174 0x76,0xb8,0xe0,0xad,0xa0,0xf1,0x3d,0x90,
175 0x40,0x5d,0x6a,0xe5,0x53,0x86,0xbd,0x28,
176 0xbd,0xd2,0x19,0xb8,0xa0,0x8d,0xed,0x1a,
177 0xa8,0x36,0xef,0xcc,0x8b,0x77,0x0d,0xc7,
178 0xda,0x41,0x59,0x7c,0x51,0x57,0x48,0x8d,
179 0x77,0x24,0xe0,0x3f,0xb8,0xd8,0x4a,0x37,
180 0x6a,0x43,0xb8,0xf4,0x15,0x18,0xa1,0x1c,
181 0xc3,0x87,0xb6,0x69,0xb2,0xee,0x65,0x86,
182 };
183 #else
184 #error crypto_core_ROUNDS must be 8, 12, or 20.
185 #endif
186
187 check(crypto_core_constant32[0] == le32dec(&sigma[0]));
188 check(crypto_core_constant32[1] == le32dec(&sigma[4]));
189 check(crypto_core_constant32[2] == le32dec(&sigma[8]));
190 check(crypto_core_constant32[3] == le32dec(&sigma[12]));
191
192 crypto_core(block, zero32, zero32, crypto_core_constant32);
193 for (i = 0; i < 16; i++)
194 check(block[i] == le32dec(&out[i*4]));
195 }
196
197 #undef check
198
199 #define CPRNG_FAST_SEED_BYTES (crypto_core_KEYWORDS * sizeof(uint32_t))
201
202 struct cprng_fast {
203 uint32_t buffer[crypto_core_OUTPUTWORDS];
204 uint32_t key[crypto_core_KEYWORDS];
205 uint32_t nonce[crypto_core_INPUTWORDS];
206 struct evcnt *reseed_evcnt;
207 unsigned epoch;
208 };
209
210 __CTASSERT(sizeof ((struct cprng_fast *)0)->key == CPRNG_FAST_SEED_BYTES);
211
212 static void cprng_fast_init_cpu(void *, void *, struct cpu_info *);
213 static void cprng_fast_schedule_reseed(struct cprng_fast *);
214 static void cprng_fast_intr(void *);
215
216 static void cprng_fast_seed(struct cprng_fast *, const void *);
217 static void cprng_fast_buf(struct cprng_fast *, void *, unsigned);
218
219 static void cprng_fast_buf_short(void *, size_t);
220 static void cprng_fast_buf_long(void *, size_t);
221
222 static percpu_t *cprng_fast_percpu __read_mostly;
223 static void *cprng_fast_softint __read_mostly;
224
225 void
226 cprng_fast_init(void)
227 {
228
229 crypto_core_selftest();
230 cprng_fast_percpu = percpu_create(sizeof(struct cprng_fast),
231 cprng_fast_init_cpu, NULL, NULL);
232 cprng_fast_softint = softint_establish(SOFTINT_SERIAL|SOFTINT_MPSAFE,
233 &cprng_fast_intr, NULL);
234 }
235
236 static void
237 cprng_fast_init_cpu(void *p, void *arg __unused, struct cpu_info *ci)
238 {
239 struct cprng_fast *const cprng = p;
240 uint8_t seed[CPRNG_FAST_SEED_BYTES];
241
242 cprng->epoch = entropy_epoch();
243 cprng_strong(kern_cprng, seed, sizeof seed, 0);
244 cprng_fast_seed(cprng, seed);
245 (void)explicit_memset(seed, 0, sizeof seed);
246
247 cprng->reseed_evcnt = kmem_alloc(sizeof(*cprng->reseed_evcnt),
248 KM_SLEEP);
249 evcnt_attach_dynamic(cprng->reseed_evcnt, EVCNT_TYPE_MISC, NULL,
250 ci->ci_cpuname, "cprng_fast reseed");
251 }
252
253 static inline int
255 cprng_fast_get(struct cprng_fast **cprngp)
256 {
257 struct cprng_fast *cprng;
258 int s;
259
260 *cprngp = cprng = percpu_getref(cprng_fast_percpu);
261 s = splvm();
262
263 if (__predict_false(cprng->epoch != entropy_epoch()))
264 cprng_fast_schedule_reseed(cprng);
265
266 return s;
267 }
268
269 static inline void
270 cprng_fast_put(struct cprng_fast *cprng, int s)
271 {
272
273 KASSERT((cprng == percpu_getref(cprng_fast_percpu)) &&
274 (percpu_putref(cprng_fast_percpu), true));
275 splx(s);
276 percpu_putref(cprng_fast_percpu);
277 }
278
279 static void
280 cprng_fast_schedule_reseed(struct cprng_fast *cprng __unused)
281 {
282
283 softint_schedule(cprng_fast_softint);
284 }
285
286 static void
287 cprng_fast_intr(void *cookie __unused)
288 {
289 unsigned epoch = entropy_epoch();
290 struct cprng_fast *cprng;
291 uint8_t seed[CPRNG_FAST_SEED_BYTES];
292 int s;
293
294 cprng_strong(kern_cprng, seed, sizeof(seed), 0);
295
296 cprng = percpu_getref(cprng_fast_percpu);
297 s = splvm();
298 cprng_fast_seed(cprng, seed);
299 cprng->epoch = epoch;
300 cprng->reseed_evcnt->ev_count++;
301 splx(s);
302 percpu_putref(cprng_fast_percpu);
303
304 explicit_memset(seed, 0, sizeof(seed));
305 }
306
307 /* CPRNG algorithm */
309
310 /*
311 * The state consists of a key, the current nonce, and a 64-byte buffer
312 * of output. Since we fill the buffer only when we need output, and
313 * eat a 32-bit word at a time, one 32-bit word of the buffer would be
314 * wasted. Instead, we repurpose it to count the number of entries in
315 * the buffer remaining, counting from high to low in order to allow
316 * comparison to zero to detect when we need to refill it.
317 */
318 #define CPRNG_FAST_BUFIDX (crypto_core_OUTPUTWORDS - 1)
319
320 static void
321 cprng_fast_seed(struct cprng_fast *cprng, const void *seed)
322 {
323
324 (void)memset(cprng->buffer, 0, sizeof cprng->buffer);
325 (void)memcpy(cprng->key, seed, sizeof cprng->key);
326 (void)memset(cprng->nonce, 0, sizeof cprng->nonce);
327 }
328
329 static inline uint32_t
330 cprng_fast_word(struct cprng_fast *cprng)
331 {
332 uint32_t v;
333
334 if (__predict_true(0 < cprng->buffer[CPRNG_FAST_BUFIDX])) {
335 v = cprng->buffer[--cprng->buffer[CPRNG_FAST_BUFIDX]];
336 } else {
337 /* If we don't have enough words, refill the buffer. */
338 crypto_core(cprng->buffer, cprng->nonce, cprng->key,
339 crypto_core_constant32);
340 if (__predict_false(++cprng->nonce[0] == 0)) {
341 cprng->nonce[1]++;
342 cprng_fast_schedule_reseed(cprng);
343 }
344 v = cprng->buffer[CPRNG_FAST_BUFIDX];
345 cprng->buffer[CPRNG_FAST_BUFIDX] = CPRNG_FAST_BUFIDX;
346 }
347
348 return v;
349 }
350
351 static inline void
352 cprng_fast_buf(struct cprng_fast *cprng, void *buf, unsigned n)
353 {
354 uint8_t *p = buf;
355 uint32_t v;
356 unsigned w, r;
357
358 w = n / sizeof(uint32_t);
359 while (w--) {
360 v = cprng_fast_word(cprng);
361 (void)memcpy(p, &v, 4);
362 p += 4;
363 }
364
365 r = n % sizeof(uint32_t);
366 if (r) {
367 v = cprng_fast_word(cprng);
368 while (r--) {
369 *p++ = (v & 0xff);
370 v >>= 8;
371 }
372 }
373 }
374
375 /*
377 * crypto_onetimestream: Expand a short unpredictable one-time seed
378 * into a long unpredictable output.
379 */
380 static void
381 crypto_onetimestream(const uint32_t seed[crypto_core_KEYWORDS], void *buf,
382 size_t n)
383 {
384 uint32_t block[crypto_core_OUTPUTWORDS];
385 uint32_t nonce[crypto_core_INPUTWORDS] = {0};
386 uint8_t *p8;
387 uint32_t *p32;
388 size_t ni, nb, nf;
389
390 /*
391 * Guarantee we can generate up to n bytes. We have
392 * 2^(32*INPUTWORDS) possible inputs yielding output of
393 * 4*OUTPUTWORDS*2^(32*INPUTWORDS) bytes. It suffices to
394 * require that sizeof n > (1/CHAR_BIT) log_2 n be less than
395 * (1/CHAR_BIT) log_2 of the total output stream length. We
396 * have
397 *
398 * log_2 (4 o 2^(32 i)) = log_2 (4 o) + log_2 2^(32 i)
399 * = 2 + log_2 o + 32 i.
400 */
401 __CTASSERT(CHAR_BIT*sizeof n <=
402 (2 + ilog2(crypto_core_OUTPUTWORDS) + 32*crypto_core_INPUTWORDS));
403
404 p8 = buf;
405 p32 = (uint32_t *)roundup2((uintptr_t)p8, sizeof(uint32_t));
406 ni = (uint8_t *)p32 - p8;
407 if (n < ni)
408 ni = n;
409 nb = (n - ni) / sizeof block;
410 nf = (n - ni) % sizeof block;
411
412 KASSERT(((uintptr_t)p32 & 3) == 0);
413 KASSERT(ni <= n);
414 KASSERT(nb <= (n / sizeof block));
415 KASSERT(nf <= n);
416 KASSERT(n == (ni + (nb * sizeof block) + nf));
417 KASSERT(ni < sizeof(uint32_t));
418 KASSERT(nf < sizeof block);
419
420 if (ni) {
421 crypto_core(block, nonce, seed, crypto_core_constant32);
422 nonce[0]++;
423 (void)memcpy(p8, block, ni);
424 }
425 while (nb--) {
426 crypto_core(p32, nonce, seed, crypto_core_constant32);
427 if (++nonce[0] == 0)
428 nonce[1]++;
429 p32 += crypto_core_OUTPUTWORDS;
430 }
431 if (nf) {
432 crypto_core(block, nonce, seed, crypto_core_constant32);
433 if (++nonce[0] == 0)
434 nonce[1]++;
435 (void)memcpy(p32, block, nf);
436 }
437
438 if (ni | nf)
439 (void)explicit_memset(block, 0, sizeof block);
440 }
441
442 /* Public API */
444
445 uint32_t
446 cprng_fast32(void)
447 {
448 struct cprng_fast *cprng;
449 uint32_t v;
450 int s;
451
452 s = cprng_fast_get(&cprng);
453 v = cprng_fast_word(cprng);
454 cprng_fast_put(cprng, s);
455
456 return v;
457 }
458
459 uint64_t
460 cprng_fast64(void)
461 {
462 struct cprng_fast *cprng;
463 uint32_t hi, lo;
464 int s;
465
466 s = cprng_fast_get(&cprng);
467 hi = cprng_fast_word(cprng);
468 lo = cprng_fast_word(cprng);
469 cprng_fast_put(cprng, s);
470
471 return ((uint64_t)hi << 32) | lo;
472 }
473
474 static void
475 cprng_fast_buf_short(void *buf, size_t len)
476 {
477 struct cprng_fast *cprng;
478 int s;
479
480 s = cprng_fast_get(&cprng);
481 cprng_fast_buf(cprng, buf, len);
482 cprng_fast_put(cprng, s);
483 }
484
485 static __noinline void
486 cprng_fast_buf_long(void *buf, size_t len)
487 {
488 uint32_t seed[crypto_core_KEYWORDS];
489 struct cprng_fast *cprng;
490 int s;
491
492 s = cprng_fast_get(&cprng);
493 cprng_fast_buf(cprng, seed, sizeof seed);
494 cprng_fast_put(cprng, s);
495
496 crypto_onetimestream(seed, buf, len);
497
498 (void)explicit_memset(seed, 0, sizeof seed);
499 }
500
501 size_t
502 cprng_fast(void *buf, size_t len)
503 {
504
505 /*
506 * We don't want to hog the CPU, so we use the short version,
507 * to generate output without preemption, only if we can do it
508 * with at most one crypto_core.
509 */
510 if (len <= (sizeof(uint32_t) * crypto_core_OUTPUTWORDS))
511 cprng_fast_buf_short(buf, len);
512 else
513 cprng_fast_buf_long(buf, len);
514
515 return len;
516 }
517