/*	$NetBSD: aes_sse2_subr.c,v 1.3 2020/07/25 22:29:56 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_sse2_subr.c,v 1.3 2020/07/25 22:29:56 riastradh Exp $");

#ifdef _KERNEL
#include <sys/systm.h>
#include <lib/libkern/libkern.h>
#else
#include <err.h>
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#define	KASSERT			assert
#define	panic(fmt, args...)	err(1, fmt, ##args)
#endif

#include <crypto/aes/aes.h>
#include <crypto/aes/arch/x86/aes_sse2.h>

#include "aes_sse2_impl.h"

void
aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds)
{
	size_t key_len;

	switch (nrounds) {
	case 10:
		key_len = 16;
		break;
	case 12:
		key_len = 24;
		break;
	case 14:
		key_len = 32;
		break;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}

	aes_sse2_keysched(rk, key, key_len);
}
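
#ifdef AES_SSE2_EXAMPLES	/* hypothetical guard -- illustrative sketches, not built */
/*
 * Sketch of a caller: schedule a 128-bit key.  The nrounds argument
 * selects the key length (10 -> 16 bytes, 12 -> 24, 14 -> 32);
 * AES_128_NROUNDS is assumed to be the 10-round constant from
 * <crypto/aes/aes.h>.
 */
static void
aes_sse2_example_setkey(struct aesenc *enc, const uint8_t key[static 16])
{

	aes_sse2_setkey(enc->aese_aes.aes_rk64, key, AES_128_NROUNDS);
}
#endif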

void
aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks. */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, encrypt, transform from bitslice. */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block. */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}
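
#ifdef AES_SSE2_EXAMPLES
/*
 * Sketch: round trip through the bitslice representation.  The paired
 * aes_sse2_ortho calls above suggest it is an involution, so
 * interleaving a block in, orthogonalizing twice, and interleaving
 * back out should reproduce the input (an assumption from usage, not
 * a documented contract).
 */
static int
aes_sse2_example_ortho_roundtrip(const uint8_t in[static 16])
{
	__m128i q[4];
	uint8_t buf[16];

	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();
	aes_sse2_ortho(q);
	aes_sse2_ortho(q);
	_mm_storeu_epi8(buf, aes_sse2_interleave_out(q[0]));

	return memcmp(buf, in, 16) == 0;
}
#endif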

void
aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks. */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, decrypt, transform from bitslice. */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block. */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load the IV. */
	cv = _mm_loadu_epi8(iv);

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* Load input block and apply CV. */
		q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in));

		/* Transform to bitslice, encrypt, transform from bitslice. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Remember ciphertext as CV and store output block. */
		cv = aes_sse2_interleave_out(q[0]);
		_mm_storeu_epi8(out, cv);
	}

	/* Store updated IV. */
	_mm_storeu_epi8(iv, cv);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}
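
#ifdef AES_SSE2_EXAMPLES
/*
 * Sketch: the loop above computes the CBC recurrence
 *
 *	C[0] = IV,	C[i] = E_K(P[i] ^ C[i-1]),
 *
 * one block per bitsliced batch, since each input depends on the
 * previous output.  An equivalent (slower) scalar formulation in
 * terms of aes_sse2_enc:
 */
static void
aes_sse2_example_cbc_enc(const struct aesenc *enc, const uint8_t *in,
    uint8_t *out, size_t nbytes, uint8_t iv[static 16], uint32_t nrounds)
{
	uint8_t x[16];
	unsigned i;

	KASSERT(nbytes % 16 == 0);

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		for (i = 0; i < 16; i++)
			x[i] = in[i] ^ iv[i];
		aes_sse2_enc(enc, x, out, nrounds);
		memcpy(iv, out, 16);
	}
}
#endif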

void
aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv, iv, w;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load the IV. */
	iv = _mm_loadu_epi8(ivp);

	/* Load the last cipher block. */
	cv = _mm_loadu_epi8(in + nbytes - 16);

	/* Store the updated IV. */
	_mm_storeu_epi8(ivp, cv);

	/* Process the last blocks if not an even multiple of four. */
	if (nbytes % (4*16)) {
		unsigned n = (nbytes/16) % 4;

		KASSERT(n > 0);
		KASSERT(n < 4);

		q[1] = q[2] = q[3] = _mm_setzero_si128();
		q[n - 1] = aes_sse2_interleave_in(cv);
		switch (nbytes % 64) {
		case 48:
			w = _mm_loadu_epi8(in + nbytes - 32);
			q[1] = aes_sse2_interleave_in(w);
			w = _mm_loadu_epi8(in + nbytes - 48);
			q[0] = aes_sse2_interleave_in(w);
			break;
		case 32:
			w = _mm_loadu_epi8(in + nbytes - 32);
			q[0] = aes_sse2_interleave_in(w);
			break;
		case 16:
			break;
		}
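
		/*
		 * Example: for a 96-byte message C[1..6], n = 2, so
		 * q[1] holds C[6] (the saved cv) and q[0] holds C[5],
		 * loaded from in + nbytes - 32.  The loop below then
		 * emits P[6] = D(C[6]) ^ C[5] and P[5] = D(C[5]) ^ C[4],
		 * leaving 64 bytes for the four-block loop.
		 */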

		/* Decrypt. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		do {
			n--;
			w = aes_sse2_interleave_out(q[n]);
			if ((nbytes -= 16) == 0)
				goto out;
			cv = _mm_loadu_epi8(in + nbytes - 16);
			_mm_storeu_epi8(out + nbytes, w ^ cv);
		} while (n);
	}

	for (;;) {
		KASSERT(nbytes >= 64);
		nbytes -= 64;

		/*
		 * 1. Set up upper cipher block from cv.
		 * 2. Load lower cipher block into cv and set it up.
		 * 3. Decrypt.
		 */
		q[3] = aes_sse2_interleave_in(cv);

		w = _mm_loadu_epi8(in + nbytes + 4*8);
		q[2] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*4);
		q[1] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*0);
		q[0] = aes_sse2_interleave_in(w);

		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the upper output block. */
		w = aes_sse2_interleave_out(q[3]);
		cv = _mm_loadu_epi8(in + nbytes + 4*8);
		_mm_storeu_epi8(out + nbytes + 4*12, w ^ cv);

		/* Store the middle output blocks. */
		w = aes_sse2_interleave_out(q[2]);
		cv = _mm_loadu_epi8(in + nbytes + 4*4);
		_mm_storeu_epi8(out + nbytes + 4*8, w ^ cv);

		w = aes_sse2_interleave_out(q[1]);
		cv = _mm_loadu_epi8(in + nbytes + 4*0);
		_mm_storeu_epi8(out + nbytes + 4*4, w ^ cv);

		/*
		 * Get the first output block, but don't load the CV
		 * yet -- it might be the previous ciphertext block, or
		 * it might be the IV.
		 */
		w = aes_sse2_interleave_out(q[0]);

		/* Stop if we've reached the first output block. */
		if (nbytes == 0)
			goto out;

		/*
		 * Load the preceding cipher block, and apply it as the
		 * chaining value to this one.
		 */
		cv = _mm_loadu_epi8(in + nbytes - 16);
		_mm_storeu_epi8(out + nbytes, w ^ cv);
	}

out:	/* Store the first output block. */
	_mm_storeu_epi8(out, w ^ iv);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

static inline __m128i
aes_sse2_xts_update(__m128i t)
{
	const __m128i one = _mm_set_epi64x(1, 1);
	__m128i s, m, c;

	s = _mm_srli_epi64(t, 63);	/* 1 if high bit set else 0 */
	m = _mm_sub_epi64(s, one);	/* 0 if high bit set else -1 */
	m = _mm_shuffle_epi32(m, 0x4e);	/* swap halves */
	c = _mm_set_epi64x(1, 0x87);	/* carry */

	return _mm_slli_epi64(t, 1) ^ (c & ~m);
}
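
#ifdef AES_SSE2_EXAMPLES
/*
 * Sketch: scalar reference for the tweak update above.  The 128-bit
 * tweak, taken little-endian, is doubled as a polynomial over
 * GF(2^128) modulo x^128 + x^7 + x^2 + x + 1: shift left one bit,
 * and if a bit falls off the top, fold it back in as 0x87 in the low
 * byte.  This matches the self-test vectors below.
 */
static void
aes_sse2_example_xts_update_ref(uint8_t t[static 16])
{
	unsigned i, carry, msb;

	for (i = 0, carry = 0; i < 16; i++) {
		msb = t[i] >> 7;
		t[i] = (t[i] << 1) | carry;
		carry = msb;
	}
	if (carry)
		t[0] ^= 0x87;
}
#endif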

static int
aes_sse2_xts_update_selftest(void)
{
	static const struct {
		uint32_t in[4], out[4];
	} cases[] = {
		[0] = { {1}, {2} },
		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
	};
	unsigned i;
	uint32_t t[4];
	int result = 0;

	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
		t[0] = cases[i].in[0];
		t[1] = cases[i].in[1];
		t[2] = cases[i].in[2];
		t[3] = cases[i].in[3];
		_mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t)));
		if (t[0] != cases[i].out[0] ||
		    t[1] != cases[i].out[1] ||
		    t[2] != cases[i].out[2] ||
		    t[3] != cases[i].out[3]) {
			printf("%s %u:"
			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
			    __func__, i, t[0], t[1], t[2], t[3]);
			result = -1;
		}
	}

	return result;
}

void
aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
	unsigned i;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load tweak. */
	t[0] = _mm_loadu_epi8(tweak);

	/* Handle the first blocks separately if not a multiple of four. */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs. */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Encrypt up to four blocks. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs. */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block. */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs. */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Encrypt four blocks. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs. */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block. */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak. */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);
}
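
#ifdef AES_SSE2_EXAMPLES
/*
 * Sketch: a single XTS block in terms of the primitives above --
 * C[i] = E_K(P[i] ^ T[i]) ^ T[i], then T[i+1] = T[i] * x.  Uses the
 * scalar tweak-update sketch from earlier in this file.
 */
static void
aes_sse2_example_xts_enc1(const struct aesenc *enc,
    const uint8_t p[static 16], uint8_t c[static 16],
    uint8_t tweak[static 16], uint32_t nrounds)
{
	uint8_t x[16];
	unsigned i;

	for (i = 0; i < 16; i++)
		x[i] = p[i] ^ tweak[i];
	aes_sse2_enc(enc, x, x, nrounds);
	for (i = 0; i < 16; i++)
		c[i] = x[i] ^ tweak[i];
	aes_sse2_example_xts_update_ref(tweak);
}
#endif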

void
aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
	unsigned i;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load tweak. */
	t[0] = _mm_loadu_epi8(tweak);

	/* Handle the first blocks separately if not a multiple of four. */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs. */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Decrypt up to four blocks. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs. */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block. */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs. */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Decrypt four blocks. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs. */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block. */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak. */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);
}

void
aes_sse2_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
    size_t nbytes, uint8_t auth[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load initial authenticator. */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(auth));

	for (; nbytes; nbytes -= 16, in += 16) {
		q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in));
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);
	}

	/* Store updated authenticator. */
	_mm_storeu_epi8(auth, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}
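
#ifdef AES_SSE2_EXAMPLES
/*
 * Sketch: the loop above is the CBC-MAC recurrence A <- E_K(A ^ M[i])
 * for each 16-byte block M[i], with the XOR done in the interleaved
 * domain (the interleave transform is a bit rearrangement, so it
 * commutes with XOR).  Equivalent scalar formulation:
 */
static void
aes_sse2_example_cbcmac(const struct aesenc *enc, const uint8_t *m,
    size_t nbytes, uint8_t auth[static 16], uint32_t nrounds)
{
	unsigned i;

	KASSERT(nbytes % 16 == 0);

	for (; nbytes; nbytes -= 16, m += 16) {
		for (i = 0; i < 16; i++)
			auth[i] ^= m[i];
		aes_sse2_enc(enc, auth, auth, nrounds);
	}
}
#endif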

void
aes_sse2_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i ctr;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Set first block to authenticator. */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr));

	/* Load initial counter block, big-endian so we can increment it. */
	c0 = le32dec(authctr + 16 + 4*0);
	c1 = le32dec(authctr + 16 + 4*1);
	c2 = le32dec(authctr + 16 + 4*2);
	c3 = be32dec(authctr + 16 + 4*3);

	/* Set other blocks to garbage -- can't take advantage. */
	q[2] = q[3] = _mm_setzero_si128();

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* Update authenticator. */
		q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in));

		/* Increment 32-bit counter. */
		ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
		q[1] = aes_sse2_interleave_in(ctr);

		/* Encrypt authenticator and counter. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Encrypt with CTR output. */
		_mm_storeu_epi8(out,
		    _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[1]));
	}

	/* Update authenticator. */
	_mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[0]));

	/* Update counter. */
	be32enc(authctr + 16 + 4*3, c3);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}
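
/*
 * Layout note: the authctr[32] buffer above carries both pieces of
 * CCM state -- authctr[0..15] is the running CBC-MAC authenticator,
 * and authctr[16..31] is the counter block, whose last four bytes
 * authctr[28..31] hold the 32-bit big-endian block counter that the
 * loop increments.  Only those 32 bits are incremented; the caller
 * is presumed to enforce CCM's message-length limits.
 */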

void
aes_sse2_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i ctr, block;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load initial counter block, big-endian so we can increment it. */
	c0 = le32dec(authctr + 16 + 4*0);
	c1 = le32dec(authctr + 16 + 4*1);
	c2 = le32dec(authctr + 16 + 4*2);
	c3 = be32dec(authctr + 16 + 4*3);

	/* Increment 32-bit counter. */
	ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
	q[0] = aes_sse2_interleave_in(ctr);

	/*
	 * Set the other blocks to garbage -- we don't have any
	 * plaintext to authenticate yet.
	 */
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Encrypt first CTR. */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Load the initial authenticator. */
	q[1] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr));

	for (;; in += 16, out += 16) {
		/* Decrypt the block. */
		block = _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[0]);

		/* Update authenticator. */
		q[1] ^= aes_sse2_interleave_in(block);

		/* Store plaintext. */
		_mm_storeu_epi8(out, block);

		/* If this is the last block, stop. */
		if ((nbytes -= 16) == 0)
			break;

		/* Increment 32-bit counter. */
		ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
		q[0] = aes_sse2_interleave_in(ctr);

		/* Authenticate previous plaintext, encrypt next CTR. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);
	}

	/*
	 * Authenticate last plaintext. We're only doing this for the
	 * authenticator, not for the counter, so don't bother to
	 * initialize q[0], q[2], q[3]. (Even for the sake of
	 * sanitizers, they're already initialized to something by
	 * now.)
	 */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Update authenticator. */
	_mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[1]));

	/* Update counter. */
	be32enc(authctr + 16 + 4*3, c3);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

int
aes_sse2_selftest(void)
{

	if (aes_sse2_xts_update_selftest())
		return -1;

	/* XXX test aes_sse2_bitslice_decrypt */
	/* XXX test aes_sse2_bitslice_encrypt */
	/* XXX test aes_sse2_keysched */
	/* XXX test aes_sse2_ortho */
	/* XXX test aes_sse2_skey_expand */

	return 0;
}
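
#ifdef AES_SSE2_EXAMPLES
/*
 * Sketch: a userland harness for the self-test.  The guard and this
 * main are illustrative only; build without _KERNEL and with SSE2
 * enabled.
 */
int
main(void)
{

	if (aes_sse2_selftest()) {
		printf("aes_sse2 selftest failed\n");
		return 1;
	}
	printf("aes_sse2 selftest passed\n");
	return 0;
}
#endif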