aes_sse2_subr.c revision 1.2 1 /* $NetBSD: aes_sse2_subr.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $ */
2
3 /*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 __KERNEL_RCSID(1, "$NetBSD: aes_sse2_subr.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $");
31
32 #ifdef _KERNEL
33 #include <sys/systm.h>
34 #include <lib/libkern/libkern.h>
35 #else
36 #include <err.h>
37 #include <assert.h>
38 #include <inttypes.h>
39 #include <stdio.h>
40 #include <string.h>
41 #define KASSERT assert
42 #define panic(fmt, args...) err(1, fmt, ##args)
43 #endif
44
45 #include <crypto/aes/aes.h>
46 #include <crypto/aes/arch/x86/aes_sse2.h>
47
48 #include "aes_sse2_impl.h"
49
/*
 * aes_sse2_setkey(rk, key, nrounds)
 *
 *	Translate the AES round count into the key length in bytes
 *	(10 -> 16, 12 -> 24, 14 -> 32) and pass rk, key, and that
 *	length to aes_sse2_keysched.  Any other round count panics.
 */
void
aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds)
{
	size_t nkeybytes;

	if (nrounds == 10)
		nkeybytes = 16;
	else if (nrounds == 12)
		nkeybytes = 24;
	else if (nrounds == 14)
		nkeybytes = 32;
	else
		panic("invalid AES nrounds: %u", nrounds);

	aes_sse2_keysched(rk, key, nkeybytes);
}
71
72 void
73 aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16],
74 uint8_t out[static 16], uint32_t nrounds)
75 {
76 uint64_t sk_exp[120];
77 __m128i q[4];
78
79 /* Expand round keys for bitslicing. */
80 aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);
81
82 /* Load input block interleaved with garbage blocks. */
83 q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
84 q[1] = q[2] = q[3] = _mm_setzero_si128();
85
86 /* Transform to bitslice, decrypt, transform from bitslice. */
87 aes_sse2_ortho(q);
88 aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
89 aes_sse2_ortho(q);
90
91 /* Store output block. */
92 _mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));
93
94 /* Paranoia: Zero temporary buffers. */
95 explicit_memset(sk_exp, 0, sizeof sk_exp);
96 explicit_memset(q, 0, sizeof q);
97 }
98
99 void
100 aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16],
101 uint8_t out[static 16], uint32_t nrounds)
102 {
103 uint64_t sk_exp[120];
104 __m128i q[4];
105
106 /* Expand round keys for bitslicing. */
107 aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);
108
109 /* Load input block interleaved with garbage blocks. */
110 q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
111 q[1] = q[2] = q[3] = _mm_setzero_si128();
112
113 /* Transform to bitslice, decrypt, transform from bitslice. */
114 aes_sse2_ortho(q);
115 aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
116 aes_sse2_ortho(q);
117
118 /* Store output block. */
119 _mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));
120
121 /* Paranoia: Zero temporary buffers. */
122 explicit_memset(sk_exp, 0, sizeof sk_exp);
123 explicit_memset(q, 0, sizeof q);
124 }
125
126 void
127 aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
128 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
129 uint32_t nrounds)
130 {
131 uint64_t sk_exp[120];
132 __m128i q[4];
133 __m128i cv;
134
135 KASSERT(nbytes);
136 KASSERT(nbytes % 16 == 0);
137
138 /* Expand round keys for bitslicing. */
139 aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);
140
141 /* Load the IV. */
142 cv = _mm_loadu_epi8(iv);
143
144 for (; nbytes; nbytes -= 16, in += 16, out += 16) {
145 /* Load input block and apply CV. */
146 q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in));
147
148 /* Transform to bitslice, encrypt, transform from bitslice. */
149 aes_sse2_ortho(q);
150 aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
151 aes_sse2_ortho(q);
152
153 /* Remember ciphertext as CV and store output block. */
154 cv = aes_sse2_interleave_out(q[0]);
155 _mm_storeu_epi8(out, cv);
156 }
157
158 /* Store updated IV. */
159 _mm_storeu_epi8(iv, cv);
160
161 /* Paranoia: Zero temporary buffers. */
162 explicit_memset(sk_exp, 0, sizeof sk_exp);
163 explicit_memset(q, 0, sizeof q);
164 }
165
166 void
167 aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
168 uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16],
169 uint32_t nrounds)
170 {
171 uint64_t sk_exp[120];
172 __m128i q[4];
173 __m128i cv, iv, w;
174
175 KASSERT(nbytes);
176 KASSERT(nbytes % 16 == 0);
177
178 /* Expand round keys for bitslicing. */
179 aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);
180
181 /* Load the IV. */
182 iv = _mm_loadu_epi8(ivp);
183
184 /* Load the last cipher block. */
185 cv = _mm_loadu_epi8(in + nbytes - 16);
186
187 /* Store the updated IV. */
188 _mm_storeu_epi8(ivp, cv);
189
190 /* Process the last blocks if not an even multiple of four. */
191 if (nbytes % (4*16)) {
192 unsigned n = (nbytes/16) % 4;
193
194 KASSERT(n > 0);
195 KASSERT(n < 4);
196
197 q[1] = q[2] = q[3] = _mm_setzero_si128();
198 q[n - 1] = aes_sse2_interleave_in(cv);
199 switch (nbytes % 64) {
200 case 48:
201 w = _mm_loadu_epi8(in + nbytes - 32);
202 q[1] = aes_sse2_interleave_in(w);
203 /*FALLTHROUGH*/
204 case 32:
205 w = _mm_loadu_epi8(in + nbytes - 48);
206 q[0] = aes_sse2_interleave_in(w);
207 /*FALLTHROUGH*/
208 case 16:
209 break;
210 }
211
212 /* Decrypt. */
213 aes_sse2_ortho(q);
214 aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
215 aes_sse2_ortho(q);
216
217 do {
218 n--;
219 w = aes_sse2_interleave_out(q[n]);
220 if ((nbytes -= 16) == 0)
221 goto out;
222 cv = _mm_loadu_epi8(in + nbytes - 16);
223 _mm_storeu_epi8(out + nbytes, w ^ cv);
224 } while (n);
225 }
226
227 for (;;) {
228 KASSERT(nbytes >= 64);
229 nbytes -= 64;
230
231 /*
232 * 1. Set up upper cipher block from cv.
233 * 2. Load lower cipher block into cv and set it up.
234 * 3. Decrypt.
235 */
236 q[3] = aes_sse2_interleave_in(cv);
237
238 w = _mm_loadu_epi8(in + nbytes + 4*8);
239 q[2] = aes_sse2_interleave_in(w);
240
241 w = _mm_loadu_epi8(in + nbytes + 4*4);
242 q[1] = aes_sse2_interleave_in(w);
243
244 w = _mm_loadu_epi8(in + nbytes + 4*0);
245 q[0] = aes_sse2_interleave_in(w);
246
247 aes_sse2_ortho(q);
248 aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
249 aes_sse2_ortho(q);
250
251 /* Store the upper output block. */
252 w = aes_sse2_interleave_out(q[3]);
253 cv = _mm_loadu_epi8(in + nbytes + 4*8);
254 _mm_storeu_epi8(out + nbytes + 4*12, w ^ cv);
255
256 /* Store the middle output blocks. */
257 w = aes_sse2_interleave_out(q[2]);
258 cv = _mm_loadu_epi8(in + nbytes + 4*4);
259 _mm_storeu_epi8(out + nbytes + 4*8, w ^ cv);
260
261 w = aes_sse2_interleave_out(q[1]);
262 cv = _mm_loadu_epi8(in + nbytes + 4*0);
263 _mm_storeu_epi8(out + nbytes + 4*4, w ^ cv);
264
265 /*
266 * Get the first output block, but don't load the CV
267 * yet -- it might be the previous ciphertext block, or
268 * it might be the IV.
269 */
270 w = aes_sse2_interleave_out(q[0]);
271
272 /* Stop if we've reached the first output block. */
273 if (nbytes == 0)
274 goto out;
275
276 /*
277 * Load the preceding cipher block, and apply it as the
278 * chaining value to this one.
279 */
280 cv = _mm_loadu_epi8(in + nbytes - 16);
281 _mm_storeu_epi8(out + nbytes, w ^ cv);
282 }
283
284 out: /* Store the first output block. */
285 _mm_storeu_epi8(out, w ^ iv);
286
287 /* Paranoia: Zero temporary buffers. */
288 explicit_memset(sk_exp, 0, sizeof sk_exp);
289 explicit_memset(q, 0, sizeof q);
290 }
291
/*
 * aes_sse2_xts_update(t)
 *
 *	Return t shifted left by one bit as a 128-bit quantity held in
 *	two 64-bit halves.  A bit shifted out of the low half is
 *	carried into bit 0 of the high half; a bit shifted out of the
 *	high half is folded back into the low half as 0x87.
 */
static inline __m128i
aes_sse2_xts_update(__m128i t)
{
	const __m128i ones = _mm_set_epi64x(1, 1);
	const __m128i carry = _mm_set_epi64x(1, 0x87);
	__m128i msb, keep, doubled;

	/* Per half: 1 if its top bit is set, else 0.  */
	msb = _mm_srli_epi64(t, 63);

	/* Per half: 0 if its top bit is set, else all ones.  */
	keep = _mm_sub_epi64(msb, ones);

	/* Swap halves so each half sees the other half's overflow.  */
	keep = _mm_shuffle_epi32(keep, 0x4e);

	/* Shift each half, then xor in the carries that are not masked.  */
	doubled = _mm_slli_epi64(t, 1);
	return _mm_xor_si128(doubled, _mm_andnot_si128(keep, carry));
}
305
/*
 * aes_sse2_xts_update_selftest()
 *
 *	Run aes_sse2_xts_update over a fixed table of inputs, each
 *	given as four 32-bit words, and compare with the expected
 *	outputs.  Every mismatch is reported with printf.  Returns 0
 *	if all cases matched, -1 if any did not.
 */
static int
aes_sse2_xts_update_selftest(void)
{
	static const struct {
		uint32_t in[4], out[4];
	} cases[] = {
		[0] = { {1}, {2} },
		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
	};
	uint32_t t[4];
	unsigned i, j;
	int match;
	int result = 0;

	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
		/* Work on a scratch copy of the input words.  */
		for (j = 0; j < 4; j++)
			t[j] = cases[i].in[j];

		_mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t)));

		match = 1;
		for (j = 0; j < 4; j++) {
			if (t[j] != cases[i].out[j])
				match = 0;
		}
		if (match)
			continue;

		/* Report what we actually got and keep checking.  */
		printf("%s %u:"
		    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
		    __func__, i, t[0], t[1], t[2], t[3]);
		result = -1;
	}

	return result;
}
342
343 void
344 aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
345 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
346 uint32_t nrounds)
347 {
348 uint64_t sk_exp[120];
349 __m128i q[4];
350 __m128i w;
351 __m128i t[5];
352 unsigned i;
353
354 KASSERT(nbytes);
355 KASSERT(nbytes % 16 == 0);
356
357 /* Expand round keys for bitslicing. */
358 aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);
359
360 /* Load tweak. */
361 t[0] = _mm_loadu_epi8(tweak);
362
363 /* Handle the first block separately if odd number. */
364 if (nbytes % (4*16)) {
365 /* Load up the tweaked inputs. */
366 for (i = 0; i < (nbytes/16) % 4; i++) {
367 w = _mm_loadu_epi8(in + 16*i) ^ t[i];
368 q[i] = aes_sse2_interleave_in(w);
369 t[i + 1] = aes_sse2_xts_update(t[i]);
370 }
371 for (; i < 4; i++)
372 q[i] = _mm_setzero_si128();
373
374 /* Encrypt up to four blocks. */
375 aes_sse2_ortho(q);
376 aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
377 aes_sse2_ortho(q);
378
379 /* Store the tweaked outputs. */
380 for (i = 0; i < (nbytes/16) % 4; i++) {
381 w = aes_sse2_interleave_out(q[i]);
382 _mm_storeu_epi8(out + 16*i, w ^ t[i]);
383 }
384
385 /* Advance to the next block. */
386 t[0] = t[i];
387 in += nbytes % (4*16);
388 out += nbytes % (4*16);
389 nbytes -= nbytes % (4*16);
390 if (nbytes == 0)
391 goto out;
392 }
393
394 do {
395 KASSERT(nbytes % 64 == 0);
396 KASSERT(nbytes >= 64);
397
398 /* Load up the tweaked inputs. */
399 for (i = 0; i < 4; i++) {
400 w = _mm_loadu_epi8(in + 16*i) ^ t[i];
401 q[i] = aes_sse2_interleave_in(w);
402 t[i + 1] = aes_sse2_xts_update(t[i]);
403 }
404
405 /* Encrypt four blocks. */
406 aes_sse2_ortho(q);
407 aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
408 aes_sse2_ortho(q);
409
410 /* Store the tweaked outputs. */
411 for (i = 0; i < 4; i++) {
412 w = aes_sse2_interleave_out(q[i]);
413 _mm_storeu_epi8(out + 16*i, w ^ t[i]);
414 }
415
416 /* Advance to the next block. */
417 t[0] = t[4];
418 in += 64;
419 out += 64;
420 nbytes -= 64;
421 } while (nbytes);
422
423 out: /* Store the updated tweak. */
424 _mm_storeu_epi8(tweak, t[0]);
425
426 /* Paranoia: Zero temporary buffers. */
427 explicit_memset(sk_exp, 0, sizeof sk_exp);
428 explicit_memset(q, 0, sizeof q);
429 explicit_memset(t, 0, sizeof t);
430 }
431
432 void
433 aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
434 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
435 uint32_t nrounds)
436 {
437 uint64_t sk_exp[120];
438 __m128i q[4];
439 __m128i w;
440 __m128i t[5];
441 unsigned i;
442
443 KASSERT(nbytes);
444 KASSERT(nbytes % 16 == 0);
445
446 /* Expand round keys for bitslicing. */
447 aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);
448
449 /* Load tweak. */
450 t[0] = _mm_loadu_epi8(tweak);
451
452 /* Handle the first block separately if odd number. */
453 if (nbytes % (4*16)) {
454 /* Load up the tweaked inputs. */
455 for (i = 0; i < (nbytes/16) % 4; i++) {
456 w = _mm_loadu_epi8(in + 16*i) ^ t[i];
457 q[i] = aes_sse2_interleave_in(w);
458 t[i + 1] = aes_sse2_xts_update(t[i]);
459 }
460 for (; i < 4; i++)
461 q[i] = _mm_setzero_si128();
462
463 /* Decrypt up to four blocks. */
464 aes_sse2_ortho(q);
465 aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
466 aes_sse2_ortho(q);
467
468 /* Store the tweaked outputs. */
469 for (i = 0; i < (nbytes/16) % 4; i++) {
470 w = aes_sse2_interleave_out(q[i]);
471 _mm_storeu_epi8(out + 16*i, w ^ t[i]);
472 }
473
474 /* Advance to the next block. */
475 t[0] = t[i];
476 in += nbytes % (4*16);
477 out += nbytes % (4*16);
478 nbytes -= nbytes % (4*16);
479 if (nbytes == 0)
480 goto out;
481 }
482
483 do {
484 KASSERT(nbytes % 64 == 0);
485 KASSERT(nbytes >= 64);
486
487 /* Load up the tweaked inputs. */
488 for (i = 0; i < 4; i++) {
489 w = _mm_loadu_epi8(in + 16*i) ^ t[i];
490 q[i] = aes_sse2_interleave_in(w);
491 t[i + 1] = aes_sse2_xts_update(t[i]);
492 }
493
494 /* Decrypt four blocks. */
495 aes_sse2_ortho(q);
496 aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
497 aes_sse2_ortho(q);
498
499 /* Store the tweaked outputs. */
500 for (i = 0; i < 4; i++) {
501 w = aes_sse2_interleave_out(q[i]);
502 _mm_storeu_epi8(out + 16*i, w ^ t[i]);
503 }
504
505 /* Advance to the next block. */
506 t[0] = t[4];
507 in += 64;
508 out += 64;
509 nbytes -= 64;
510 } while (nbytes);
511
512 out: /* Store the updated tweak. */
513 _mm_storeu_epi8(tweak, t[0]);
514
515 /* Paranoia: Zero temporary buffers. */
516 explicit_memset(sk_exp, 0, sizeof sk_exp);
517 explicit_memset(q, 0, sizeof q);
518 explicit_memset(t, 0, sizeof t);
519 }
520
/*
 * aes_sse2_selftest()
 *
 *	Run the self-tests available for this implementation.  At
 *	present that is only the XTS tweak update check.  Returns 0 on
 *	success, -1 on failure.
 */
int
aes_sse2_selftest(void)
{
	int failed;

	failed = (aes_sse2_xts_update_selftest() != 0);

	/* XXX test aes_sse2_bitslice_decrypt */
	/* XXX test aes_sse2_bitslice_encrypt */
	/* XXX test aes_sse2_keysched */
	/* XXX test aes_sse2_ortho */
	/* XXX test aes_sse2_skey_expand */

	return failed ? -1 : 0;
}
536