aes_ssse3.c revision 1.2 1 1.2 riastrad /* $NetBSD: aes_ssse3.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $ */
2 1.1 riastrad
3 1.1 riastrad /*-
4 1.1 riastrad * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 1.1 riastrad * All rights reserved.
6 1.1 riastrad *
7 1.1 riastrad * Redistribution and use in source and binary forms, with or without
8 1.1 riastrad * modification, are permitted provided that the following conditions
9 1.1 riastrad * are met:
10 1.1 riastrad * 1. Redistributions of source code must retain the above copyright
11 1.1 riastrad * notice, this list of conditions and the following disclaimer.
12 1.1 riastrad * 2. Redistributions in binary form must reproduce the above copyright
13 1.1 riastrad * notice, this list of conditions and the following disclaimer in the
14 1.1 riastrad * documentation and/or other materials provided with the distribution.
15 1.1 riastrad *
16 1.1 riastrad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 1.1 riastrad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 1.1 riastrad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 1.1 riastrad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 1.1 riastrad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 1.1 riastrad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 1.1 riastrad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 1.1 riastrad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 1.1 riastrad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 1.1 riastrad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 1.1 riastrad * POSSIBILITY OF SUCH DAMAGE.
27 1.1 riastrad */
28 1.1 riastrad
29 1.1 riastrad /*
30 1.1 riastrad * Permutation-based AES using SSSE3, derived from Mike Hamburg's VPAES
31 1.1 riastrad * software, at <https://crypto.stanford.edu/vpaes/>, described in
32 1.1 riastrad *
33 1.1 riastrad * Mike Hamburg, `Accelerating AES with Vector Permute
34 1.1 riastrad * Instructions', in Christophe Clavier and Kris Gaj (eds.),
35 1.1 riastrad * Cryptographic Hardware and Embedded Systems -- CHES 2009,
36 1.1 riastrad * Springer LNCS 5747, pp. 18-32.
37 1.1 riastrad *
38 1.1 riastrad * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
39 1.1 riastrad */
40 1.1 riastrad
41 1.1 riastrad #include <sys/cdefs.h>
42 1.2 riastrad __KERNEL_RCSID(1, "$NetBSD: aes_ssse3.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $");
43 1.1 riastrad
44 1.1 riastrad #include <sys/types.h>
45 1.1 riastrad
46 1.2 riastrad #ifdef _KERNEL
47 1.1 riastrad #include <sys/systm.h>
48 1.2 riastrad #else
49 1.2 riastrad #include <err.h>
50 1.2 riastrad #define panic(fmt, args...) err(1, fmt, ##args)
51 1.2 riastrad #endif
52 1.1 riastrad
53 1.1 riastrad #include "aes_ssse3_impl.h"
54 1.1 riastrad
/*
 * 128-bit constant that can be spelled as two 64-bit halves (u64[0] is
 * the low half) and read back as a vector through .m.
 */
union m128const {
	uint64_t u64[2];
	__m128i m;
};

/*
 * Byte-shuffle masks.  mc_forward and mc_backward are selected per
 * round by aes_ssse3_enc1; mc_forward[0] is also used by the key
 * mangling routines and mc_forward[3] seeds aes_ssse3_dec1.
 */
static const union m128const mc_forward[4] = {
	{.u64 = {0x0407060500030201, 0x0C0F0E0D080B0A09}},
	{.u64 = {0x080B0A0904070605, 0x000302010C0F0E0D}},
	{.u64 = {0x0C0F0E0D080B0A09, 0x0407060500030201}},
	{.u64 = {0x000302010C0F0E0D, 0x080B0A0904070605}},
};
static const union m128const mc_backward[4] = {
	{.u64 = {0x0605040702010003, 0x0E0D0C0F0A09080B}},
	{.u64 = {0x020100030E0D0C0F, 0x0A09080B06050407}},
	{.u64 = {0x0E0D0C0F0A09080B, 0x0605040702010003}},
	{.u64 = {0x0A09080B06050407, 0x020100030E0D0C0F}},
};

/*
 * Nybble-indexed table pairs for aes_schedule_transform: entry [0] is
 * looked up by the low nybbles, entry [1] by the high nybbles.
 * ipt is applied to incoming keys and to the block in aes_ssse3_enc1,
 * opt to the last encryption round key, dipt to the block in
 * aes_ssse3_dec1.
 */
static const union m128const ipt[2] = {
	{.u64 = {0xC2B2E8985A2A7000, 0xCABAE09052227808}},
	{.u64 = {0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81}},
};
static const union m128const opt[2] = {
	{.u64 = {0xFF9F4929D6B66000, 0xF7974121DEBE6808}},
	{.u64 = {0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0}},
};
static const union m128const dipt[2] = {
	{.u64 = {0x0F505B040B545F00, 0x154A411E114E451A}},
	{.u64 = {0x86E383E660056500, 0x12771772F491F194}},
};

/*
 * Table pairs indexed by the two outputs of subbytes(): entry [0] by
 * io, entry [1] by jo.  sb1/sb2/sbo are used on the encryption side
 * (sb1 also by the key schedule); dsb9/dsbd/dsbb/dsbe/dsbo are used by
 * aes_ssse3_dec1.
 */
static const union m128const sb1[2] = {
	{.u64 = {0xB19BE18FCB503E00, 0xA5DF7A6E142AF544}},
	{.u64 = {0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF}},
};
static const union m128const sb2[2] = {
	{.u64 = {0xE27A93C60B712400, 0x5EB7E955BC982FCD}},
	{.u64 = {0x69EB88400AE12900, 0xC2A163C8AB82234A}},
};
static const union m128const sbo[2] = {
	{.u64 = {0xD0D26D176FBDC700, 0x15AABF7AC502A878}},
	{.u64 = {0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA}},
};
static const union m128const dsb9[2] = {
	{.u64 = {0x851C03539A86D600, 0xCAD51F504F994CC9}},
	{.u64 = {0xC03B1789ECD74900, 0x725E2C9EB2FBA565}},
};
static const union m128const dsbd[2] = {
	{.u64 = {0x7D57CCDFE6B1A200, 0xF56E9B13882A4439}},
	{.u64 = {0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3}},
};
static const union m128const dsbb[2] = {
	{.u64 = {0xD022649296B44200, 0x602646F6B0F2D404}},
	{.u64 = {0xC19498A6CD596700, 0xF3FF0C3E3255AA6B}},
};
static const union m128const dsbe[2] = {
	{.u64 = {0x46F2929626D4D000, 0x2242600464B4F6B0}},
	{.u64 = {0x0C55A6CDFFAAC100, 0x9467F36B98593E32}},
};
static const union m128const dsbo[2] = {
	{.u64 = {0x1387EA537EF94000, 0xC7AA6DB9D4943E2D}},
	{.u64 = {0x12D7560F93441D00, 0xCA4B8159D8C58E9C}},
};

/*
 * Nybble-indexed table pairs applied one after another by
 * aes_schedule_mangle_dec, and the pair applied to the last
 * decryption round key by aes_schedule_mangle_last_dec.
 */
static const union m128const dks1[2] = {
	{.u64 = {0xB6116FC87ED9A700, 0x4AED933482255BFC}},
	{.u64 = {0x4576516227143300, 0x8BB89FACE9DAFDCE}},
};
static const union m128const dks2[2] = {
	{.u64 = {0x27438FEBCCA86400, 0x4622EE8AADC90561}},
	{.u64 = {0x815C13CE4F92DD00, 0x73AEE13CBD602FF2}},
};
static const union m128const dks3[2] = {
	{.u64 = {0x03C4C50201C6C700, 0xF83F3EF9FA3D3CFB}},
	{.u64 = {0xEE1921D638CFF700, 0xA5526A9D7384BC4B}},
};
static const union m128const dks4[2] = {
	{.u64 = {0xE3C390B053732000, 0xA080D3F310306343}},
	{.u64 = {0xA0CA214B036982E8, 0x2F45AEC48CE60D67}},
};
static const union m128const deskew[2] = {
	{.u64 = {0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A}},
	{.u64 = {0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77}},
};

/*
 * Byte-shuffle masks selected by a counter taken mod 4; applied to
 * round keys as they are stored and to the final state of the
 * encrypt/decrypt routines.  sr[0] is the identity shuffle.
 */
static const union m128const sr[4] = {
	{.u64 = {0x0706050403020100, 0x0F0E0D0C0B0A0908}},
	{.u64 = {0x030E09040F0A0500, 0x0B06010C07020D08}},
	{.u64 = {0x0F060D040B020900, 0x070E050C030A0108}},
	{.u64 = {0x0B0E0104070A0D00, 0x0306090C0F020508}},
};

/* Initial value of the rotating round-constant vector in the key schedule. */
static const union m128const rcon =
	{.u64 = {0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81}};

/* Constant XORed into every byte by the key-schedule routines. */
static const union m128const s63 =
	{.u64 = {0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B}};

/* Mask selecting the low nybble of every byte. */
static const union m128const of =
	{.u64 = {0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F}};

/* Nybble-indexed lookup tables behind gf16_inv() and gf16_inva(). */
static const union m128const inv =
	{.u64 = {0x0E05060F0D080180, 0x040703090A0B0C02}};
static const union m128const inva =
	{.u64 = {0x01040A060F0B0780, 0x030D0E0C02050809}};
146 1.1 riastrad
/*
 * Fetch one 128-bit round key from the four 32-bit words at rk32.
 * Uses the aligned load, so rk32 must be 16-byte aligned.
 */
static inline __m128i
loadroundkey(const uint32_t *rk32)
{
	const void *src = rk32;

	return _mm_load_si128(src);
}
152 1.1 riastrad
/*
 * Write the 128-bit round key rk to the four 32-bit words at rk32.
 * Uses the aligned store, so rk32 must be 16-byte aligned.
 */
static inline void
storeroundkey(uint32_t *rk32, __m128i rk)
{
	void *dst = rk32;

	_mm_store_si128(dst, rk);
}
158 1.1 riastrad
159 1.1 riastrad /* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */
160 1.1 riastrad static inline void
161 1.1 riastrad bytes2nybbles(__m128i *restrict lo, __m128i *restrict hi, __m128i x)
162 1.1 riastrad {
163 1.1 riastrad
164 1.1 riastrad *lo = x & of.m;
165 1.1 riastrad *hi = _mm_srli_epi32(x & ~of.m, 4);
166 1.1 riastrad }
167 1.1 riastrad
168 1.1 riastrad /* Given 0p0q0r0s, return 0x0y0z0w where x = a/p, y = a/q, &c. */
169 1.1 riastrad static inline __m128i
170 1.1 riastrad gf16_inva(__m128i x)
171 1.1 riastrad {
172 1.1 riastrad return _mm_shuffle_epi8(inva.m, x);
173 1.1 riastrad }
174 1.1 riastrad
175 1.1 riastrad /* Given 0p0q0r0s, return 0x0y0z0w where x = 1/p, y = 1/q, &c. */
176 1.1 riastrad static inline __m128i
177 1.1 riastrad gf16_inv(__m128i x)
178 1.1 riastrad {
179 1.1 riastrad return _mm_shuffle_epi8(inv.m, x);
180 1.1 riastrad }
181 1.1 riastrad
182 1.1 riastrad /*
183 1.1 riastrad * t is a pair of maps respectively from low and high nybbles to bytes.
184 1.1 riastrad * Apply t the nybbles, and add the results in GF(2).
185 1.1 riastrad */
186 1.1 riastrad static __m128i
187 1.1 riastrad aes_schedule_transform(__m128i x, const union m128const t[static 2])
188 1.1 riastrad {
189 1.1 riastrad __m128i lo, hi;
190 1.1 riastrad
191 1.1 riastrad bytes2nybbles(&lo, &hi, x);
192 1.1 riastrad return _mm_shuffle_epi8(t[0].m, lo) ^ _mm_shuffle_epi8(t[1].m, hi);
193 1.1 riastrad }
194 1.1 riastrad
195 1.1 riastrad static inline void
196 1.1 riastrad subbytes(__m128i *io, __m128i *jo, __m128i x)
197 1.1 riastrad {
198 1.1 riastrad __m128i k, i, ak, j;
199 1.1 riastrad
200 1.1 riastrad bytes2nybbles(&k, &i, x);
201 1.1 riastrad ak = gf16_inva(k);
202 1.1 riastrad j = i ^ k;
203 1.1 riastrad *io = j ^ gf16_inv(ak ^ gf16_inv(i));
204 1.1 riastrad *jo = i ^ gf16_inv(ak ^ gf16_inv(j));
205 1.1 riastrad }
206 1.1 riastrad
207 1.1 riastrad static __m128i
208 1.1 riastrad aes_schedule_low_round(__m128i rk, __m128i prk)
209 1.1 riastrad {
210 1.1 riastrad __m128i io, jo;
211 1.1 riastrad
212 1.1 riastrad /* smear prk */
213 1.1 riastrad prk ^= _mm_slli_si128(prk, 4);
214 1.1 riastrad prk ^= _mm_slli_si128(prk, 8);
215 1.1 riastrad prk ^= s63.m;
216 1.1 riastrad
217 1.1 riastrad /* subbytes */
218 1.1 riastrad subbytes(&io, &jo, rk);
219 1.1 riastrad rk = _mm_shuffle_epi8(sb1[0].m, io) ^ _mm_shuffle_epi8(sb1[1].m, jo);
220 1.1 riastrad
221 1.1 riastrad /* add in smeared stuff */
222 1.1 riastrad return rk ^ prk;
223 1.1 riastrad }
224 1.1 riastrad
225 1.1 riastrad static __m128i
226 1.1 riastrad aes_schedule_round(__m128i rk, __m128i prk, __m128i *rcon_rot)
227 1.1 riastrad {
228 1.1 riastrad
229 1.1 riastrad /* extract rcon from rcon_rot */
230 1.1 riastrad prk ^= _mm_alignr_epi8(_mm_setzero_si128(), *rcon_rot, 15);
231 1.1 riastrad *rcon_rot = _mm_alignr_epi8(*rcon_rot, *rcon_rot, 15);
232 1.1 riastrad
233 1.1 riastrad /* rotate */
234 1.1 riastrad rk = _mm_shuffle_epi32(rk, 0xff);
235 1.1 riastrad rk = _mm_alignr_epi8(rk, rk, 1);
236 1.1 riastrad
237 1.1 riastrad return aes_schedule_low_round(rk, prk);
238 1.1 riastrad }
239 1.1 riastrad
240 1.1 riastrad static __m128i
241 1.1 riastrad aes_schedule_mangle_enc(__m128i x, __m128i sr_i)
242 1.1 riastrad {
243 1.1 riastrad __m128i y = _mm_setzero_si128();
244 1.1 riastrad
245 1.1 riastrad x ^= s63.m;
246 1.1 riastrad
247 1.1 riastrad x = _mm_shuffle_epi8(x, mc_forward[0].m);
248 1.1 riastrad y ^= x;
249 1.1 riastrad x = _mm_shuffle_epi8(x, mc_forward[0].m);
250 1.1 riastrad y ^= x;
251 1.1 riastrad x = _mm_shuffle_epi8(x, mc_forward[0].m);
252 1.1 riastrad y ^= x;
253 1.1 riastrad
254 1.1 riastrad return _mm_shuffle_epi8(y, sr_i);
255 1.1 riastrad }
256 1.1 riastrad
257 1.1 riastrad static __m128i
258 1.1 riastrad aes_schedule_mangle_last_enc(__m128i x, __m128i sr_i)
259 1.1 riastrad {
260 1.1 riastrad
261 1.1 riastrad return aes_schedule_transform(_mm_shuffle_epi8(x, sr_i) ^ s63.m, opt);
262 1.1 riastrad }
263 1.1 riastrad
264 1.1 riastrad static __m128i
265 1.1 riastrad aes_schedule_mangle_dec(__m128i x, __m128i sr_i)
266 1.1 riastrad {
267 1.1 riastrad __m128i y = _mm_setzero_si128();
268 1.1 riastrad
269 1.1 riastrad x = aes_schedule_transform(x, dks1);
270 1.1 riastrad y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
271 1.1 riastrad x = aes_schedule_transform(x, dks2);
272 1.1 riastrad y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
273 1.1 riastrad x = aes_schedule_transform(x, dks3);
274 1.1 riastrad y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
275 1.1 riastrad x = aes_schedule_transform(x, dks4);
276 1.1 riastrad y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
277 1.1 riastrad
278 1.1 riastrad return _mm_shuffle_epi8(y, sr_i);
279 1.1 riastrad }
280 1.1 riastrad
281 1.1 riastrad static __m128i
282 1.1 riastrad aes_schedule_mangle_last_dec(__m128i x)
283 1.1 riastrad {
284 1.1 riastrad
285 1.1 riastrad return aes_schedule_transform(x ^ s63.m, deskew);
286 1.1 riastrad }
287 1.1 riastrad
/*
 * 192-bit key schedule helper.  With 32-bit words h0..h3 in prkhi and
 * p0..p3 in prk (word 0 lowest), returns
 *
 *	{ h0^h0^p2, h1^h0^p3, h2^h0^p3, h3^h2^p3 }.
 */
static __m128i
aes_schedule_192_smear(__m128i prkhi, __m128i prk)
{
	const __m128i hi_mix = _mm_shuffle_epi32(prkhi, 0x80);
	const __m128i lo_mix = _mm_shuffle_epi32(prk, 0xfe);

	return prkhi ^ hi_mix ^ lo_mix;
}
299 1.1 riastrad
/*
 * Keep the high 64 bits of rk in place and clear the low 64 bits.
 */
static __m128i
aes_schedule_192_smearhi(__m128i rk)
{
	const __m128 zero = _mm_setzero_ps();
	const __m128 kept = _mm_movehl_ps((__m128)rk, zero);

	return (__m128i)kept;
}
305 1.1 riastrad
306 1.1 riastrad void
307 1.1 riastrad aes_ssse3_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
308 1.1 riastrad {
309 1.1 riastrad uint32_t *rk32 = enc->aese_aes.aes_rk;
310 1.1 riastrad __m128i mrk; /* mangled round key */
311 1.1 riastrad __m128i rk; /* round key */
312 1.1 riastrad __m128i prk; /* previous round key */
313 1.1 riastrad __m128i rcon_rot = rcon.m;
314 1.1 riastrad uint64_t i = 3;
315 1.1 riastrad
316 1.1 riastrad /* input transform */
317 1.1 riastrad rk = aes_schedule_transform(_mm_loadu_epi8(key), ipt);
318 1.1 riastrad storeroundkey(rk32, rk);
319 1.1 riastrad rk32 += 4;
320 1.1 riastrad
321 1.1 riastrad switch (nrounds) {
322 1.1 riastrad case 10:
323 1.1 riastrad for (;;) {
324 1.1 riastrad rk = aes_schedule_round(rk, rk, &rcon_rot);
325 1.1 riastrad if (--nrounds == 0)
326 1.1 riastrad break;
327 1.1 riastrad mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
328 1.1 riastrad storeroundkey(rk32, mrk);
329 1.1 riastrad rk32 += 4;
330 1.1 riastrad }
331 1.1 riastrad break;
332 1.1 riastrad case 12: {
333 1.1 riastrad __m128i prkhi; /* high half of previous round key */
334 1.1 riastrad
335 1.1 riastrad prk = rk;
336 1.1 riastrad rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
337 1.1 riastrad prkhi = aes_schedule_192_smearhi(rk);
338 1.1 riastrad for (;;) {
339 1.1 riastrad prk = aes_schedule_round(rk, prk, &rcon_rot);
340 1.1 riastrad rk = _mm_alignr_epi8(prk, prkhi, 8);
341 1.1 riastrad
342 1.1 riastrad mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
343 1.1 riastrad storeroundkey(rk32, mrk);
344 1.1 riastrad rk32 += 4;
345 1.1 riastrad rk = aes_schedule_192_smear(prkhi, prk);
346 1.1 riastrad prkhi = aes_schedule_192_smearhi(rk);
347 1.1 riastrad
348 1.1 riastrad mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
349 1.1 riastrad storeroundkey(rk32, mrk);
350 1.1 riastrad rk32 += 4;
351 1.1 riastrad rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
352 1.1 riastrad if ((nrounds -= 3) == 0)
353 1.1 riastrad break;
354 1.1 riastrad
355 1.1 riastrad mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
356 1.1 riastrad storeroundkey(rk32, mrk);
357 1.1 riastrad rk32 += 4;
358 1.1 riastrad rk = aes_schedule_192_smear(prkhi, prk);
359 1.1 riastrad prkhi = aes_schedule_192_smearhi(rk);
360 1.1 riastrad }
361 1.1 riastrad break;
362 1.1 riastrad }
363 1.1 riastrad case 14: {
364 1.1 riastrad __m128i pprk; /* previous previous round key */
365 1.1 riastrad
366 1.1 riastrad prk = rk;
367 1.1 riastrad rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
368 1.1 riastrad for (;;) {
369 1.1 riastrad mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
370 1.1 riastrad storeroundkey(rk32, mrk);
371 1.1 riastrad rk32 += 4;
372 1.1 riastrad pprk = rk;
373 1.1 riastrad
374 1.1 riastrad /* high round */
375 1.1 riastrad rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
376 1.1 riastrad if ((nrounds -= 2) == 0)
377 1.1 riastrad break;
378 1.1 riastrad mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
379 1.1 riastrad storeroundkey(rk32, mrk);
380 1.1 riastrad rk32 += 4;
381 1.1 riastrad
382 1.1 riastrad /* low round */
383 1.1 riastrad rk = _mm_shuffle_epi32(rk, 0xff);
384 1.1 riastrad rk = aes_schedule_low_round(rk, pprk);
385 1.1 riastrad }
386 1.1 riastrad break;
387 1.1 riastrad }
388 1.1 riastrad default:
389 1.1 riastrad panic("invalid number of AES rounds: %u", nrounds);
390 1.1 riastrad }
391 1.1 riastrad storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4].m));
392 1.1 riastrad }
393 1.1 riastrad
394 1.1 riastrad void
395 1.1 riastrad aes_ssse3_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
396 1.1 riastrad {
397 1.1 riastrad uint32_t *rk32 = dec->aesd_aes.aes_rk;
398 1.1 riastrad __m128i mrk; /* mangled round key */
399 1.1 riastrad __m128i ork; /* original round key */
400 1.1 riastrad __m128i rk; /* round key */
401 1.1 riastrad __m128i prk; /* previous round key */
402 1.1 riastrad __m128i rcon_rot = rcon.m;
403 1.1 riastrad unsigned i = nrounds == 12 ? 0 : 2;
404 1.1 riastrad
405 1.1 riastrad ork = _mm_loadu_epi8(key);
406 1.1 riastrad
407 1.1 riastrad /* input transform */
408 1.1 riastrad rk = aes_schedule_transform(ork, ipt);
409 1.1 riastrad
410 1.1 riastrad /* go from end */
411 1.1 riastrad rk32 += 4*nrounds;
412 1.1 riastrad storeroundkey(rk32, _mm_shuffle_epi8(ork, sr[i].m));
413 1.1 riastrad rk32 -= 4;
414 1.1 riastrad i ^= 3;
415 1.1 riastrad
416 1.1 riastrad switch (nrounds) {
417 1.1 riastrad case 10:
418 1.1 riastrad for (;;) {
419 1.1 riastrad rk = aes_schedule_round(rk, rk, &rcon_rot);
420 1.1 riastrad if (--nrounds == 0)
421 1.1 riastrad break;
422 1.1 riastrad mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
423 1.1 riastrad storeroundkey(rk32, mrk);
424 1.1 riastrad rk32 -= 4;
425 1.1 riastrad }
426 1.1 riastrad break;
427 1.1 riastrad case 12: {
428 1.1 riastrad __m128i prkhi; /* high half of previous round key */
429 1.1 riastrad
430 1.1 riastrad prk = rk;
431 1.1 riastrad rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
432 1.1 riastrad prkhi = aes_schedule_192_smearhi(rk);
433 1.1 riastrad for (;;) {
434 1.1 riastrad prk = aes_schedule_round(rk, prk, &rcon_rot);
435 1.1 riastrad rk = _mm_alignr_epi8(prk, prkhi, 8);
436 1.1 riastrad
437 1.1 riastrad mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
438 1.1 riastrad storeroundkey(rk32, mrk);
439 1.1 riastrad rk32 -= 4;
440 1.1 riastrad rk = aes_schedule_192_smear(prkhi, prk);
441 1.1 riastrad prkhi = aes_schedule_192_smearhi(rk);
442 1.1 riastrad
443 1.1 riastrad mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
444 1.1 riastrad storeroundkey(rk32, mrk);
445 1.1 riastrad rk32 -= 4;
446 1.1 riastrad rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
447 1.1 riastrad if ((nrounds -= 3) == 0)
448 1.1 riastrad break;
449 1.1 riastrad
450 1.1 riastrad mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
451 1.1 riastrad storeroundkey(rk32, mrk);
452 1.1 riastrad rk32 -= 4;
453 1.1 riastrad rk = aes_schedule_192_smear(prkhi, prk);
454 1.1 riastrad prkhi = aes_schedule_192_smearhi(rk);
455 1.1 riastrad }
456 1.1 riastrad break;
457 1.1 riastrad }
458 1.1 riastrad case 14: {
459 1.1 riastrad __m128i pprk; /* previous previous round key */
460 1.1 riastrad
461 1.1 riastrad prk = rk;
462 1.1 riastrad rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
463 1.1 riastrad for (;;) {
464 1.1 riastrad mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
465 1.1 riastrad storeroundkey(rk32, mrk);
466 1.1 riastrad rk32 -= 4;
467 1.1 riastrad pprk = rk;
468 1.1 riastrad
469 1.1 riastrad /* high round */
470 1.1 riastrad rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
471 1.1 riastrad if ((nrounds -= 2) == 0)
472 1.1 riastrad break;
473 1.1 riastrad mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
474 1.1 riastrad storeroundkey(rk32, mrk);
475 1.1 riastrad rk32 -= 4;
476 1.1 riastrad
477 1.1 riastrad /* low round */
478 1.1 riastrad rk = _mm_shuffle_epi32(rk, 0xff);
479 1.1 riastrad rk = aes_schedule_low_round(rk, pprk);
480 1.1 riastrad }
481 1.1 riastrad break;
482 1.1 riastrad }
483 1.1 riastrad default:
484 1.1 riastrad panic("invalid number of AES rounds: %u", nrounds);
485 1.1 riastrad }
486 1.1 riastrad storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
487 1.1 riastrad }
488 1.1 riastrad
489 1.1 riastrad __m128i
490 1.1 riastrad aes_ssse3_enc1(const struct aesenc *enc, __m128i x, unsigned nrounds)
491 1.1 riastrad {
492 1.1 riastrad const uint32_t *rk32 = enc->aese_aes.aes_rk;
493 1.1 riastrad __m128i io, jo;
494 1.1 riastrad unsigned rmod4 = 0;
495 1.1 riastrad
496 1.1 riastrad x = aes_schedule_transform(x, ipt);
497 1.1 riastrad x ^= loadroundkey(rk32);
498 1.1 riastrad for (;;) {
499 1.1 riastrad __m128i A, A2, A2_B, A2_B_D;
500 1.1 riastrad
501 1.1 riastrad subbytes(&io, &jo, x);
502 1.1 riastrad
503 1.1 riastrad rk32 += 4;
504 1.1 riastrad rmod4 = (rmod4 + 1) % 4;
505 1.1 riastrad if (--nrounds == 0)
506 1.1 riastrad break;
507 1.1 riastrad
508 1.1 riastrad A = _mm_shuffle_epi8(sb1[0].m, io) ^
509 1.1 riastrad _mm_shuffle_epi8(sb1[1].m, jo);
510 1.1 riastrad A ^= loadroundkey(rk32);
511 1.1 riastrad A2 = _mm_shuffle_epi8(sb2[0].m, io) ^
512 1.1 riastrad _mm_shuffle_epi8(sb2[1].m, jo);
513 1.1 riastrad A2_B = A2 ^ _mm_shuffle_epi8(A, mc_forward[rmod4].m);
514 1.1 riastrad A2_B_D = A2_B ^ _mm_shuffle_epi8(A, mc_backward[rmod4].m);
515 1.1 riastrad x = A2_B_D ^ _mm_shuffle_epi8(A2_B, mc_forward[rmod4].m);
516 1.1 riastrad }
517 1.1 riastrad x = _mm_shuffle_epi8(sbo[0].m, io) ^ _mm_shuffle_epi8(sbo[1].m, jo);
518 1.1 riastrad x ^= loadroundkey(rk32);
519 1.1 riastrad return _mm_shuffle_epi8(x, sr[rmod4].m);
520 1.1 riastrad }
521 1.1 riastrad
522 1.1 riastrad __m128i
523 1.1 riastrad aes_ssse3_dec1(const struct aesdec *dec, __m128i x, unsigned nrounds)
524 1.1 riastrad {
525 1.1 riastrad const uint32_t *rk32 = dec->aesd_aes.aes_rk;
526 1.1 riastrad unsigned i = 3 & ~(nrounds - 1);
527 1.1 riastrad __m128i io, jo, mc;
528 1.1 riastrad
529 1.1 riastrad x = aes_schedule_transform(x, dipt);
530 1.1 riastrad x ^= loadroundkey(rk32);
531 1.1 riastrad rk32 += 4;
532 1.1 riastrad
533 1.1 riastrad mc = mc_forward[3].m;
534 1.1 riastrad for (;;) {
535 1.1 riastrad subbytes(&io, &jo, x);
536 1.1 riastrad if (--nrounds == 0)
537 1.1 riastrad break;
538 1.1 riastrad
539 1.1 riastrad x = _mm_shuffle_epi8(dsb9[0].m, io) ^
540 1.1 riastrad _mm_shuffle_epi8(dsb9[1].m, jo);
541 1.1 riastrad x ^= loadroundkey(rk32);
542 1.1 riastrad rk32 += 4; /* next round key */
543 1.1 riastrad
544 1.1 riastrad x = _mm_shuffle_epi8(x, mc);
545 1.1 riastrad x ^= _mm_shuffle_epi8(dsbd[0].m, io) ^
546 1.1 riastrad _mm_shuffle_epi8(dsbd[1].m, jo);
547 1.1 riastrad
548 1.1 riastrad x = _mm_shuffle_epi8(x, mc);
549 1.1 riastrad x ^= _mm_shuffle_epi8(dsbb[0].m, io) ^
550 1.1 riastrad _mm_shuffle_epi8(dsbb[1].m, jo);
551 1.1 riastrad
552 1.1 riastrad x = _mm_shuffle_epi8(x, mc);
553 1.1 riastrad x ^= _mm_shuffle_epi8(dsbe[0].m, io) ^
554 1.1 riastrad _mm_shuffle_epi8(dsbe[1].m, jo);
555 1.1 riastrad
556 1.1 riastrad mc = _mm_alignr_epi8(mc, mc, 12);
557 1.1 riastrad }
558 1.1 riastrad x = _mm_shuffle_epi8(dsbo[0].m, io) ^ _mm_shuffle_epi8(dsbo[1].m, jo);
559 1.1 riastrad x ^= loadroundkey(rk32);
560 1.1 riastrad return _mm_shuffle_epi8(x, sr[i].m);
561 1.1 riastrad }
562