/*	$NetBSD: aes_sse2_impl.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_sse2_impl.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $");

#include <sys/types.h>
#include <sys/endian.h>
#include <sys/systm.h>

#include <crypto/aes/aes.h>
#include <crypto/aes/arch/x86/aes_sse2.h>

#include <x86/cpu.h>
#include <x86/cpuvar.h>
#include <x86/fpu.h>
#include <x86/specialreg.h>

#include "aes_sse2_impl.h"

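/*
 * Bitsliced AES with SSE2, adapted from BearSSL's constant-time
 * 64-bit AES code.  Up to four blocks at a time are transposed into
 * the four 128-bit vectors q[0..3], so every pass through the cipher
 * costs the same whether one or four of the slots carry real data.
 * All SSE2 use is bracketed by fpu_kern_enter()/fpu_kern_leave() so
 * the kernel may safely touch the XMM registers.
 */
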
static void
aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds)
{
	size_t key_len;

	switch (nrounds) {
	case 10:
		key_len = 16;
		break;
	case 12:
		key_len = 24;
		break;
	case 14:
		key_len = 32;
		break;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}

	fpu_kern_enter();
	aes_sse2_keysched(rk, key, key_len);
	fpu_kern_leave();
}

static void
aes_sse2_setenckey(struct aesenc *enc, const uint8_t *key, uint32_t nrounds)
{

	aes_sse2_setkey(enc->aese_aes.aes_rk64, key, nrounds);
}

static void
aes_sse2_setdeckey(struct aesdec *dec, const uint8_t *key, uint32_t nrounds)
{

	/*
	 * BearSSL computes InvMixColumns on the fly -- no need for
	 * distinct decryption round keys.
	 */
	aes_sse2_setkey(dec->aesd_aes.aes_rk64, key, nrounds);
}

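/*
 * Single-block encryption: only q[0] carries real data; the other
 * three bitslice slots are zeroed and their results discarded, so one
 * block costs the same as four.
 */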
static void
aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	fpu_kern_enter();

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks. */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, encrypt, transform from bitslice. */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block. */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);

	fpu_kern_leave();
}

static void
aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	fpu_kern_enter();

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks. */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, decrypt, transform from bitslice. */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block. */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);

	fpu_kern_leave();
}

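/*
 * CBC encryption is inherently sequential -- each block's input
 * depends on the previous ciphertext block -- so only one of the four
 * bitslice slots carries real data per pass and throughput matches
 * the single-block path.
 */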
static void
aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv;

	KASSERT(nbytes % 16 == 0);

	/* Skip if there's nothing to do. */
	if (nbytes == 0)
		return;

	fpu_kern_enter();

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load the IV. */
	cv = _mm_loadu_epi8(iv);

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* Load input block and apply CV. */
		q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in));

		/* Transform to bitslice, encrypt, transform from bitslice. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Remember ciphertext as CV and store output block. */
		cv = aes_sse2_interleave_out(q[0]);
		_mm_storeu_epi8(out, cv);
	}

	/* Store updated IV. */
	_mm_storeu_epi8(iv, cv);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);

	fpu_kern_leave();
}

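/*
 * CBC decryption works from the end of the buffer toward the
 * beginning, four blocks at a time: the block decryptions are
 * independent, and walking backwards keeps each block's chaining
 * value (the preceding ciphertext block) available in the input
 * buffer until it is needed.  The IV is applied only to the very
 * first block, at the "out" label.
 */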
static void
aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv, iv, w;

	KASSERT(nbytes % 16 == 0);

	/* Skip if there's nothing to do. */
	if (nbytes == 0)
		return;

	fpu_kern_enter();

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load the IV. */
	iv = _mm_loadu_epi8(ivp);

	/* Load the last cipher block. */
	cv = _mm_loadu_epi8(in + nbytes - 16);

	/* Store the updated IV. */
	_mm_storeu_epi8(ivp, cv);

	/* Process the last blocks if not an even multiple of four. */
	if (nbytes % (4*16)) {
		unsigned n = (nbytes/16) % 4;

		KASSERT(n > 0);
		KASSERT(n < 4);

		q[1] = q[2] = q[3] = _mm_setzero_si128();
		q[n - 1] = aes_sse2_interleave_in(cv);
		/*
		 * Load the remaining trailing blocks; cv already holds
		 * the last one, in q[n - 1].
		 */
		switch (nbytes % 64) {
		case 48:
			w = _mm_loadu_epi8(in + nbytes - 32);
			q[1] = aes_sse2_interleave_in(w);
			w = _mm_loadu_epi8(in + nbytes - 48);
			q[0] = aes_sse2_interleave_in(w);
			break;
		case 32:
			w = _mm_loadu_epi8(in + nbytes - 32);
			q[0] = aes_sse2_interleave_in(w);
			break;
		case 16:
			break;
		}

		/* Decrypt. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		do {
			n--;
			w = aes_sse2_interleave_out(q[n]);
			if ((nbytes -= 16) == 0)
				goto out;
			cv = _mm_loadu_epi8(in + nbytes - 16);
			_mm_storeu_epi8(out + nbytes, w ^ cv);
		} while (n);
	}

	for (;;) {
		KASSERT(nbytes >= 64);
		nbytes -= 64;

		/*
		 * 1. Set up the last (upper) cipher block of this group
		 *    of four from cv.
		 * 2. Load the three lower cipher blocks from the input
		 *    buffer.
		 * 3. Decrypt all four.
		 */
		q[3] = aes_sse2_interleave_in(cv);

		w = _mm_loadu_epi8(in + nbytes + 4*8);
		q[2] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*4);
		q[1] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*0);
		q[0] = aes_sse2_interleave_in(w);

		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the upper output block. */
		w = aes_sse2_interleave_out(q[3]);
		cv = _mm_loadu_epi8(in + nbytes + 4*8);
		_mm_storeu_epi8(out + nbytes + 4*12, w ^ cv);

		/* Store the middle output blocks. */
		w = aes_sse2_interleave_out(q[2]);
		cv = _mm_loadu_epi8(in + nbytes + 4*4);
		_mm_storeu_epi8(out + nbytes + 4*8, w ^ cv);

		w = aes_sse2_interleave_out(q[1]);
		cv = _mm_loadu_epi8(in + nbytes + 4*0);
		_mm_storeu_epi8(out + nbytes + 4*4, w ^ cv);

		/*
		 * Get the first output block, but don't load the CV
		 * yet -- it might be the previous ciphertext block, or
		 * it might be the IV.
		 */
		w = aes_sse2_interleave_out(q[0]);

		/* Stop if we've reached the first output block. */
		if (nbytes == 0)
			goto out;

		/*
		 * Load the preceding cipher block, and apply it as the
		 * chaining value to this one.
		 */
		cv = _mm_loadu_epi8(in + nbytes - 16);
		_mm_storeu_epi8(out + nbytes, w ^ cv);
	}

out:	/* Store the first output block. */
	_mm_storeu_epi8(out, w ^ iv);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);

	fpu_kern_leave();
}

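/*
 * Advance the XTS tweak: multiply the 128-bit tweak by x in GF(2^128)
 * modulo x^128 + x^7 + x^2 + x + 1.  SSE2 has no 128-bit shift, so
 * each 64-bit lane is shifted left by one separately; the bit shifted
 * out of the low lane must be carried into bit 0 of the high lane,
 * and the bit shifted out of the high lane is reduced by XORing 0x87
 * into the low lane.  The lane swap below routes each carry mask to
 * the opposite lane without branching on secret data.
 */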
static inline __m128i
aes_sse2_xts_update(__m128i t)
{
	const __m128i one = _mm_set_epi64x(1, 1);
	__m128i s, m, c;

	s = _mm_srli_epi64(t, 63);	/* 1 if high bit set else 0 */
	m = _mm_sub_epi64(s, one);	/* 0 if high bit set else -1 */
	m = _mm_shuffle_epi32(m, 0x4e);	/* swap halves */
	c = _mm_set_epi64x(1, 0x87);	/* carry */

	return _mm_slli_epi64(t, 1) ^ (c & ~m);
}

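/*
 * Known-answer tests for aes_sse2_xts_update.  Each 128-bit value is
 * written as four 32-bit words, least significant first.  The cases
 * cover: no carry, a carry within each 64-bit lane, the carry from
 * the low lane into the high lane, the reduction by 0x87 when the top
 * bit falls off, and a combination of carry and reduction.
 */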
static int
aes_sse2_xts_update_selftest(void)
{
	static const struct {
		uint32_t in[4], out[4];
	} cases[] = {
		[0] = { {1}, {2} },
		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
	};
	unsigned i;
	uint32_t t[4];
	int result = 0;

	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
		t[0] = cases[i].in[0];
		t[1] = cases[i].in[1];
		t[2] = cases[i].in[2];
		t[3] = cases[i].in[3];
		_mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t)));
		if (t[0] != cases[i].out[0] ||
		    t[1] != cases[i].out[1] ||
		    t[2] != cases[i].out[2] ||
		    t[3] != cases[i].out[3]) {
			printf("%s %u:"
			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
			    __func__, i, t[0], t[1], t[2], t[3]);
			result = -1;
		}
	}

	return result;
}

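/*
 * XTS encryption: each block is XORed with the current tweak before
 * and after the block cipher, and the tweak is advanced once per
 * block by aes_sse2_xts_update.  t[] holds the tweaks for up to four
 * parallel blocks, with t[i] feeding block i and the last entry
 * carrying over to the next pass.
 */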
static void
aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
	unsigned i;

	KASSERT(nbytes % 16 == 0);

	/* Skip if there's nothing to do. */
	if (nbytes == 0)
		return;

	fpu_kern_enter();

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load tweak. */
	t[0] = _mm_loadu_epi8(tweak);

	/* Handle the first 1-3 blocks separately if not a multiple of four. */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs. */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Encrypt up to four blocks. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs. */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block. */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs. */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Encrypt four blocks. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs. */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block. */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak. */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);

	fpu_kern_leave();
}

static void
aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
	unsigned i;

	KASSERT(nbytes % 16 == 0);

	/* Skip if there's nothing to do. */
	if (nbytes == 0)
		return;

	fpu_kern_enter();

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load tweak. */
	t[0] = _mm_loadu_epi8(tweak);

	/* Handle the first 1-3 blocks separately if not a multiple of four. */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs. */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Decrypt up to four blocks. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs. */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block. */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs. */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Decrypt four blocks. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs. */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block. */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak. */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);

	fpu_kern_leave();
}

static int
aes_sse2_probe(void)
{
	int result = 0;

	/* Verify that the CPU supports SSE and SSE2. */
	if (!i386_has_sse)
		return -1;
	if (!i386_has_sse2)
		return -1;

	fpu_kern_enter();

	if (aes_sse2_xts_update_selftest())
		result = -1;

	fpu_kern_leave();

	/* XXX test aes_sse2_bitslice_decrypt */
	/* XXX test aes_sse2_bitslice_encrypt */
	/* XXX test aes_sse2_keysched */
	/* XXX test aes_sse2_ortho */
	/* XXX test aes_sse2_skey_expand */

	return result;
}

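/*
 * Glue record for the machine-dependent AES dispatch: ai_probe
 * decides at runtime whether this bitsliced SSE2 implementation can
 * be used on the CPU at hand, and the remaining members are the
 * per-mode entry points.
 */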
struct aes_impl aes_sse2_impl = {
	.ai_name = "Intel SSE2 bitsliced",
	.ai_probe = aes_sse2_probe,
	.ai_setenckey = aes_sse2_setenckey,
	.ai_setdeckey = aes_sse2_setdeckey,
	.ai_enc = aes_sse2_enc,
	.ai_dec = aes_sse2_dec,
	.ai_cbc_enc = aes_sse2_cbc_enc,
	.ai_cbc_dec = aes_sse2_cbc_dec,
	.ai_xts_enc = aes_sse2_xts_enc,
	.ai_xts_dec = aes_sse2_xts_dec,
};