/*	$NetBSD: aes_ssse3.c,v 1.1 2020/06/29 23:51:35 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Permutation-based AES using SSSE3, derived from Mike Hamburg's VPAES
 * software, at <https://crypto.stanford.edu/vpaes/>, described in
 *
 *	Mike Hamburg, `Accelerating AES with Vector Permute
 *	Instructions', in Christophe Clavier and Kris Gaj (eds.),
 *	Cryptographic Hardware and Embedded Systems -- CHES 2009,
 *	Springer LNCS 5747, pp. 18-32.
 *
 *	https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
 */
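
/*
 * Rough idea of the construction (see the paper above for details):
 * each state byte is split into its low and high nybble, and
 * _mm_shuffle_epi8 (pshufb) serves as sixteen parallel lookups into a
 * 16-entry table indexed by a nybble.  The S-box, MixColumns, and the
 * key schedule are all expressed through such lookups and fixed byte
 * permutations, so there are no secret-dependent memory accesses and
 * hence none of the cache-timing leakage of conventional table-based
 * AES.
 */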

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_ssse3.c,v 1.1 2020/06/29 23:51:35 riastradh Exp $");

#include <sys/types.h>

#include <sys/systm.h>

#include "aes_ssse3_impl.h"

static const union m128const {
	uint64_t u64[2];
	__m128i m;
}
mc_forward[4] = {
	{.u64 = {0x0407060500030201, 0x0C0F0E0D080B0A09}},
	{.u64 = {0x080B0A0904070605, 0x000302010C0F0E0D}},
	{.u64 = {0x0C0F0E0D080B0A09, 0x0407060500030201}},
	{.u64 = {0x000302010C0F0E0D, 0x080B0A0904070605}},
},
mc_backward[4] = {
	{.u64 = {0x0605040702010003, 0x0E0D0C0F0A09080B}},
	{.u64 = {0x020100030E0D0C0F, 0x0A09080B06050407}},
	{.u64 = {0x0E0D0C0F0A09080B, 0x0605040702010003}},
	{.u64 = {0x0A09080B06050407, 0x020100030E0D0C0F}},
},
ipt[2] = {
	{.u64 = {0xC2B2E8985A2A7000, 0xCABAE09052227808}},
	{.u64 = {0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81}},
},
opt[2] = {
	{.u64 = {0xFF9F4929D6B66000, 0xF7974121DEBE6808}},
	{.u64 = {0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0}},
},
dipt[2] = {
	{.u64 = {0x0F505B040B545F00, 0x154A411E114E451A}},
	{.u64 = {0x86E383E660056500, 0x12771772F491F194}},
},
sb1[2] = {
	{.u64 = {0xB19BE18FCB503E00, 0xA5DF7A6E142AF544}},
	{.u64 = {0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF}},
},
sb2[2] = {
	{.u64 = {0xE27A93C60B712400, 0x5EB7E955BC982FCD}},
	{.u64 = {0x69EB88400AE12900, 0xC2A163C8AB82234A}},
},
sbo[2] = {
	{.u64 = {0xD0D26D176FBDC700, 0x15AABF7AC502A878}},
	{.u64 = {0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA}},
},
dsb9[2] = {
	{.u64 = {0x851C03539A86D600, 0xCAD51F504F994CC9}},
	{.u64 = {0xC03B1789ECD74900, 0x725E2C9EB2FBA565}},
},
dsbd[2] = {
	{.u64 = {0x7D57CCDFE6B1A200, 0xF56E9B13882A4439}},
	{.u64 = {0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3}},
},
dsbb[2] = {
	{.u64 = {0xD022649296B44200, 0x602646F6B0F2D404}},
	{.u64 = {0xC19498A6CD596700, 0xF3FF0C3E3255AA6B}},
},
dsbe[2] = {
	{.u64 = {0x46F2929626D4D000, 0x2242600464B4F6B0}},
	{.u64 = {0x0C55A6CDFFAAC100, 0x9467F36B98593E32}},
},
dsbo[2] = {
	{.u64 = {0x1387EA537EF94000, 0xC7AA6DB9D4943E2D}},
	{.u64 = {0x12D7560F93441D00, 0xCA4B8159D8C58E9C}},
},
dks1[2] = {
	{.u64 = {0xB6116FC87ED9A700, 0x4AED933482255BFC}},
	{.u64 = {0x4576516227143300, 0x8BB89FACE9DAFDCE}},
},
dks2[2] = {
	{.u64 = {0x27438FEBCCA86400, 0x4622EE8AADC90561}},
	{.u64 = {0x815C13CE4F92DD00, 0x73AEE13CBD602FF2}},
},
dks3[2] = {
	{.u64 = {0x03C4C50201C6C700, 0xF83F3EF9FA3D3CFB}},
	{.u64 = {0xEE1921D638CFF700, 0xA5526A9D7384BC4B}},
},
dks4[2] = {
	{.u64 = {0xE3C390B053732000, 0xA080D3F310306343}},
	{.u64 = {0xA0CA214B036982E8, 0x2F45AEC48CE60D67}},
},
deskew[2] = {
	{.u64 = {0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A}},
	{.u64 = {0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77}},
},
sr[4] = {
	{.u64 = {0x0706050403020100, 0x0F0E0D0C0B0A0908}},
	{.u64 = {0x030E09040F0A0500, 0x0B06010C07020D08}},
	{.u64 = {0x0F060D040B020900, 0x070E050C030A0108}},
	{.u64 = {0x0B0E0104070A0D00, 0x0306090C0F020508}},
},
rcon = {.u64 = {0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81}},
s63 = {.u64 = {0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B}},
of = {.u64 = {0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F}},
inv = {.u64 = {0x0E05060F0D080180, 0x040703090A0B0C02}},
inva = {.u64 = {0x01040A060F0B0780, 0x030D0E0C02050809}};

static inline __m128i
loadroundkey(const uint32_t *rk32)
{
	return _mm_load_si128((const void *)rk32);
}

static inline void
storeroundkey(uint32_t *rk32, __m128i rk)
{
	_mm_store_si128((void *)rk32, rk);
}

/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */
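/*
 * For example, a byte 0xAB of x yields 0x0B in *lo and 0x0A in *hi.
 * Clearing the low nybbles before the 32-bit shift keeps
 * _mm_srli_epi32, which shifts whole 32-bit lanes, from pulling a
 * neighbouring byte's bits into the result.
 */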
static inline void
bytes2nybbles(__m128i *restrict lo, __m128i *restrict hi, __m128i x)
{

	*lo = x & of.m;
	*hi = _mm_srli_epi32(x & ~of.m, 4);
}

/* Given 0p0q0r0s, return 0x0y0z0w where x = a/p, y = a/q, &c. */
static inline __m128i
gf16_inva(__m128i x)
{
	return _mm_shuffle_epi8(inva.m, x);
}

/* Given 0p0q0r0s, return 0x0y0z0w where x = 1/p, y = 1/q, &c. */
static inline __m128i
gf16_inv(__m128i x)
{
	return _mm_shuffle_epi8(inv.m, x);
}
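
/*
 * Both helpers rely on the semantics of _mm_shuffle_epi8(t, x): each
 * result byte is t[x & 0xf] when the top bit of the corresponding byte
 * of x is clear (and 0 otherwise), so a single pshufb performs sixteen
 * parallel lookups into a 16-entry table of GF(16) inverses.
 */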

/*
 * t is a pair of maps respectively from low and high nybbles to bytes.
 * Apply t to the nybbles, and add the results in GF(2).
 */
static __m128i
aes_schedule_transform(__m128i x, const union m128const t[static 2])
{
	__m128i lo, hi;

	bytes2nybbles(&lo, &hi, x);
	return _mm_shuffle_epi8(t[0].m, lo) ^ _mm_shuffle_epi8(t[1].m, hi);
}

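/*
 * Compute the nybble-valued index vectors io/jo that the sb and dsb
 * lookup tables consume.  Roughly, this is the inversion step of the
 * AES S-box carried out in the GF(16) tower representation of
 * Hamburg's construction; the affine output map is folded into
 * whichever pair of tables is applied to io/jo afterwards.
 */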
static inline void
subbytes(__m128i *io, __m128i *jo, __m128i x)
{
	__m128i k, i, ak, j;

	bytes2nybbles(&k, &i, x);
	ak = gf16_inva(k);
	j = i ^ k;
	*io = j ^ gf16_inv(ak ^ gf16_inv(i));
	*jo = i ^ gf16_inv(ak ^ gf16_inv(j));
}

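/*
 * One key-schedule step.  The two byte shifts compute a running XOR of
 * prk's 32-bit words, so word i of the smeared prk is w0 ^ ... ^ wi,
 * which is the combination of previous-round-key words the standard
 * AES key expansion feeds into each new word; the s63 constant appears
 * to account for the S-box's additive constant in this representation.
 * The S-box itself is applied to rk via subbytes and the sb1 tables,
 * and the two parts are combined.
 */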
static __m128i
aes_schedule_low_round(__m128i rk, __m128i prk)
{
	__m128i io, jo;

	/* smear prk */
	prk ^= _mm_slli_si128(prk, 4);
	prk ^= _mm_slli_si128(prk, 8);
	prk ^= s63.m;

	/* subbytes */
	subbytes(&io, &jo, rk);
	rk = _mm_shuffle_epi8(sb1[0].m, io) ^ _mm_shuffle_epi8(sb1[1].m, jo);

	/* add in smeared stuff */
	return rk ^ prk;
}

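/*
 * Full schedule round.  The round constants live in a register that is
 * rotated one byte per call: _mm_alignr_epi8(zero, *rcon_rot, 15)
 * drops the top byte of *rcon_rot into the low byte of an otherwise
 * zero vector, which is XORed into prk.  The epi32 shuffle with 0xff
 * broadcasts rk's last 32-bit word and the one-byte alignr rotates it,
 * i.e. RotWord, before handing off to the low round.
 */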
static __m128i
aes_schedule_round(__m128i rk, __m128i prk, __m128i *rcon_rot)
{

	/* extract rcon from rcon_rot */
	prk ^= _mm_alignr_epi8(_mm_setzero_si128(), *rcon_rot, 15);
	*rcon_rot = _mm_alignr_epi8(*rcon_rot, *rcon_rot, 15);

	/* rotate */
	rk = _mm_shuffle_epi32(rk, 0xff);
	rk = _mm_alignr_epi8(rk, rk, 1);

	return aes_schedule_low_round(rk, prk);
}

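/*
 * Mangle an encryption round key for storage: add s63, accumulate
 * three successive per-column byte rotations of it (mc_forward[0]
 * rotates each 4-byte column by one), and apply the ShiftRows-tracking
 * permutation sr_i.  The keys are stored in this mangled form,
 * presumably so that they line up with the transformed state that
 * aes_ssse3_enc1 operates on.
 */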
static __m128i
aes_schedule_mangle_enc(__m128i x, __m128i sr_i)
{
	__m128i y = _mm_setzero_si128();

	x ^= s63.m;

	x = _mm_shuffle_epi8(x, mc_forward[0].m);
	y ^= x;
	x = _mm_shuffle_epi8(x, mc_forward[0].m);
	y ^= x;
	x = _mm_shuffle_epi8(x, mc_forward[0].m);
	y ^= x;

	return _mm_shuffle_epi8(y, sr_i);
}

static __m128i
aes_schedule_mangle_last_enc(__m128i x, __m128i sr_i)
{

	return aes_schedule_transform(_mm_shuffle_epi8(x, sr_i) ^ s63.m, opt);
}

static __m128i
aes_schedule_mangle_dec(__m128i x, __m128i sr_i)
{
	__m128i y = _mm_setzero_si128();

	x = aes_schedule_transform(x, dks1);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
	x = aes_schedule_transform(x, dks2);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
	x = aes_schedule_transform(x, dks3);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
	x = aes_schedule_transform(x, dks4);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);

	return _mm_shuffle_epi8(y, sr_i);
}

static __m128i
aes_schedule_mangle_last_dec(__m128i x)
{

	return aes_schedule_transform(x ^ s63.m, deskew);
}

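/*
 * Helpers for the AES-192 schedule, which advances the key six 32-bit
 * words at a time while round keys are only four words wide.  smearhi
 * keeps the high 64 bits of rk and zeroes the low half; smear folds
 * words of the previous schedule state (prkhi, prk) together so the
 * main loops below can emit three round keys per iteration.
 */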
static __m128i
aes_schedule_192_smear(__m128i prkhi, __m128i prk)
{
	__m128i rk;

	rk = prkhi;
	rk ^= _mm_shuffle_epi32(prkhi, 0x80);
	rk ^= _mm_shuffle_epi32(prk, 0xfe);

	return rk;
}

static __m128i
aes_schedule_192_smearhi(__m128i rk)
{
	return (__m128i)_mm_movehl_ps((__m128)rk, _mm_setzero_ps());
}

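/*
 * Expand a 128-, 192-, or 256-bit key into the nrounds + 1 round keys
 * consumed by aes_ssse3_enc1.  The first round key is stored after the
 * input transform (ipt), the intermediate ones in the mangled form
 * produced by aes_schedule_mangle_enc, and the last one via
 * aes_schedule_mangle_last_enc, which applies the output transform.
 */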
void
aes_ssse3_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = enc->aese_aes.aes_rk;
	__m128i mrk;		/* mangled round key */
	__m128i rk;		/* round key */
	__m128i prk;		/* previous round key */
	__m128i rcon_rot = rcon.m;
	uint64_t i = 3;

	/* input transform */
	rk = aes_schedule_transform(_mm_loadu_epi8(key), ipt);
	storeroundkey(rk32, rk);
	rk32 += 4;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
		}
		break;
	case 12: {
		__m128i prkhi;		/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = _mm_alignr_epi8(prk, prkhi, 8);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		__m128i pprk;		/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;

			/* low round */
			rk = _mm_shuffle_epi32(rk, 0xff);
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4].m));
}

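/*
 * Expand a key for decryption.  The same schedule is run, but the
 * round keys are stored back to front (rk32 starts at the end of the
 * array and walks down) and are mangled for the inverse rounds via the
 * dks tables; the raw key, permuted by sr[i], sits at one end and the
 * deskewed final key at the other.
 */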
void
aes_ssse3_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = dec->aesd_aes.aes_rk;
	__m128i mrk;		/* mangled round key */
	__m128i ork;		/* original round key */
	__m128i rk;		/* round key */
	__m128i prk;		/* previous round key */
	__m128i rcon_rot = rcon.m;
	unsigned i = nrounds == 12 ? 0 : 2;

	ork = _mm_loadu_epi8(key);

	/* input transform */
	rk = aes_schedule_transform(ork, ipt);

	/* go from end */
	rk32 += 4*nrounds;
	storeroundkey(rk32, _mm_shuffle_epi8(ork, sr[i].m));
	rk32 -= 4;
	i ^= 3;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
		}
		break;
	case 12: {
		__m128i prkhi;		/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = _mm_alignr_epi8(prk, prkhi, 8);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		__m128i pprk;		/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;

			/* low round */
			rk = _mm_shuffle_epi32(rk, 0xff);
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
}

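/*
 * Encrypt one 16-byte block.  The state is carried in the vpaes basis:
 * ipt maps the input in, each full round applies the S-box through the
 * sb1/sb2 nybble lookups and MixColumns through the mc_forward and
 * mc_backward column rotations, and ShiftRows is not done explicitly
 * each round -- it is absorbed into the rotating choice of mc table
 * (rmod4) and fixed up by the final sr[] permutation.  The sbo lookup
 * and the last key addition finish the final round.
 */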
__m128i
aes_ssse3_enc1(const struct aesenc *enc, __m128i x, unsigned nrounds)
{
	const uint32_t *rk32 = enc->aese_aes.aes_rk;
	__m128i io, jo;
	unsigned rmod4 = 0;

	x = aes_schedule_transform(x, ipt);
	x ^= loadroundkey(rk32);
	for (;;) {
		__m128i A, A2, A2_B, A2_B_D;

		subbytes(&io, &jo, x);

		rk32 += 4;
		rmod4 = (rmod4 + 1) % 4;
		if (--nrounds == 0)
			break;

		A = _mm_shuffle_epi8(sb1[0].m, io) ^
		    _mm_shuffle_epi8(sb1[1].m, jo);
		A ^= loadroundkey(rk32);
		A2 = _mm_shuffle_epi8(sb2[0].m, io) ^
		    _mm_shuffle_epi8(sb2[1].m, jo);
		A2_B = A2 ^ _mm_shuffle_epi8(A, mc_forward[rmod4].m);
		A2_B_D = A2_B ^ _mm_shuffle_epi8(A, mc_backward[rmod4].m);
		x = A2_B_D ^ _mm_shuffle_epi8(A2_B, mc_forward[rmod4].m);
	}
	x = _mm_shuffle_epi8(sbo[0].m, io) ^ _mm_shuffle_epi8(sbo[1].m, jo);
	x ^= loadroundkey(rk32);
	return _mm_shuffle_epi8(x, sr[rmod4].m);
}

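/*
 * Decrypt one 16-byte block.  dipt maps the input into the decryption
 * basis; each round combines the dsb9/dsbd/dsbb/dsbe nybble lookups
 * (named, it appears, after the InvMixColumns coefficients 09, 0d, 0b,
 * 0e) with a column permutation mc that itself rotates every round,
 * and the dsbo lookup, the last key addition, and a final sr[]
 * permutation finish the last round.
 */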
__m128i
aes_ssse3_dec1(const struct aesdec *dec, __m128i x, unsigned nrounds)
{
	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
	unsigned i = 3 & ~(nrounds - 1);
	__m128i io, jo, mc;

	x = aes_schedule_transform(x, dipt);
	x ^= loadroundkey(rk32);
	rk32 += 4;

	mc = mc_forward[3].m;
	for (;;) {
		subbytes(&io, &jo, x);
		if (--nrounds == 0)
			break;

		x = _mm_shuffle_epi8(dsb9[0].m, io) ^
		    _mm_shuffle_epi8(dsb9[1].m, jo);
		x ^= loadroundkey(rk32);
		rk32 += 4;	/* next round key */

		x = _mm_shuffle_epi8(x, mc);
		x ^= _mm_shuffle_epi8(dsbd[0].m, io) ^
		    _mm_shuffle_epi8(dsbd[1].m, jo);

		x = _mm_shuffle_epi8(x, mc);
		x ^= _mm_shuffle_epi8(dsbb[0].m, io) ^
		    _mm_shuffle_epi8(dsbb[1].m, jo);

		x = _mm_shuffle_epi8(x, mc);
		x ^= _mm_shuffle_epi8(dsbe[0].m, io) ^
		    _mm_shuffle_epi8(dsbe[1].m, jo);

		mc = _mm_alignr_epi8(mc, mc, 12);
	}
	x = _mm_shuffle_epi8(dsbo[0].m, io) ^ _mm_shuffle_epi8(dsbo[1].m, jo);
	x ^= loadroundkey(rk32);
	return _mm_shuffle_epi8(x, sr[i].m);
}