/*	$NetBSD: aes_neon.c,v 1.2 2020/06/29 23:57:56 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES
 * software, at <https://crypto.stanford.edu/vpaes/>, described in
 *
 *      Mike Hamburg, `Accelerating AES with Vector Permute
 *      Instructions', in Christophe Clavier and Kris Gaj (eds.),
 *      Cryptographic Hardware and Embedded Systems -- CHES 2009,
 *      Springer LNCS 5747, pp. 18-32.
 *
 *      https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.2 2020/06/29 23:57:56 riastradh Exp $");

#include <sys/types.h>

#include <sys/systm.h>

#include "aes_neon_impl.h"

#ifdef __aarch64__
#define __aarch64_used
#else
#define __aarch64_used  __unused
#endif
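
/*
 * Illustration only (hypothetical helper, not used below): everything
 * in this file leans on the fact that vqtbl1q_u8 is a 16-entry byte
 * table lookup -- each index lane selects one byte of the table, and
 * an out-of-range index (16 or above, e.g. 0x80) yields zero.  Masking
 * inputs down to 4-bit nybbles therefore turns arbitrary byte-to-byte
 * maps into a pair of such lookups combined by XOR, which is the
 * vector-permute trick from Hamburg's paper cited above.
 */
static inline uint8x16_t
aes_neon_lookup_lo_nybble(uint8x16_t table, uint8x16_t x)
{

        /* Keep only the low nybble of each byte, then look it up. */
        return vqtbl1q_u8(table, x & vdupq_n_u8(0x0f));
}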

static const uint8x16_t
mc_forward[4] = {
        {0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
         0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C},
        {0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
         0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00},
        {0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
         0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04},
        {0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
         0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08},
},
mc_backward[4] __aarch64_used = {
        {0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
         0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E},
        {0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
         0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A},
        {0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
         0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06},
        {0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
         0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02},
},
ipt[2] __aarch64_used = {
        {0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
         0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA},
        {0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
         0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD},
},
opt[2] = {
        {0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
         0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7},
        {0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
         0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1},
},
dipt[2] __aarch64_used = {
        {0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
         0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15},
        {0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
         0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12},
},
sb1[2] __aarch64_used = {
        {0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
         0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5},
        {0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
         0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B},
},
sb2[2] __aarch64_used = {
        {0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
         0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E},
        {0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
         0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2},
},
sbo[2] __aarch64_used = {
        {0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
         0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15},
        {0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
         0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E},
},
dsb9[2] __aarch64_used = {
        {0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
         0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA},
        {0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
         0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72},
},
dsbd[2] __aarch64_used = {
        {0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
         0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5},
        {0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
         0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29},
},
dsbb[2] __aarch64_used = {
        {0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
         0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60},
        {0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
         0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3},
},
dsbe[2] __aarch64_used = {
        {0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
         0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22},
        {0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
         0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94},
},
dsbo[2] __aarch64_used = {
        {0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
         0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7},
        {0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
         0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA},
},
dks1[2] = {
        {0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
         0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A},
        {0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
         0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B},
},
dks2[2] = {
        {0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
         0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46},
        {0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
         0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73},
},
dks3[2] = {
        {0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
         0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8},
        {0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
         0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5},
},
dks4[2] = {
        {0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
         0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0},
        {0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0,
         0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F},
},
deskew[2] = {
        {0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07,
         0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D},
        {0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
         0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28},
},
sr[4] __aarch64_used = {
        {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
         0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},
        {0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
         0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B},
        {0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F,
         0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07},
        {0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B,
         0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03},
},
rcon = {0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F,
        0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70},
s63 = {0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,
       0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B},
of = {0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,
      0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F},
inv = {0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E,
       0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04},
inva = {0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
        0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03};

static inline uint8x16_t
loadroundkey(const void *rkp)
{
        return vld1q_u8(rkp);
}

static inline void
storeroundkey(void *rkp, uint8x16_t rk)
{
        vst1q_u8(rkp, rk);
}

/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */
static inline void
bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x)
{

        *lo = of & x;
        *hi = of & vshrq_n_u8(x, 4);
}

/*
 * t is a pair of maps respectively from low and high nybbles to bytes.
 * Apply t to the nybbles, and add the results in GF(2).
 */
static uint8x16_t
aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2])
{
        uint8x16_t lo, hi;

        bytes2nybbles(&lo, &hi, x);
        return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi);
}
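
/*
 * Scalar sketch of the same transform, for illustration only (this
 * hypothetical helper is not used below): byte i of the result is
 * t[0][x[i] & 0xf] ^ t[1][x[i] >> 4], i.e. the low and high nybbles
 * are pushed through their respective tables and the two outputs are
 * added in GF(2).
 */
static inline void
aes_schedule_transform_scalar(uint8_t r[static 16],
    const uint8_t x[static 16], const uint8_t t[static 2][16])
{
        unsigned i;

        for (i = 0; i < 16; i++)
                r[i] = t[0][x[i] & 0xf] ^ t[1][x[i] >> 4];
}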

static inline void
subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_,
    uint8x16_t inva_)
{
        uint8x16_t k, i, ak, j;

        bytes2nybbles(&k, &i, x);
        ak = vqtbl1q_u8(inva_, k);
        j = i ^ k;
        *io = j ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, i));
        *jo = i ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, j));
}

static uint8x16_t
aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk)
{
        uint8x16_t io, jo;

        /* smear prk */
        prk ^= vextq_u8(vdupq_n_u8(0), prk, 12);
        prk ^= vextq_u8(vdupq_n_u8(0), prk, 8);
        prk ^= s63;

        /* subbytes */
        subbytes(&io, &jo, rk, inv, inva);
        rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo);

        /* add in smeared stuff */
        return rk ^ prk;
}

static uint8x16_t
aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot)
{
        uint32x4_t rk32;

        /* extract rcon from rcon_rot */
        prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15);
        *rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15);

        /* rotate */
        rk32 = vreinterpretq_u32_u8(rk);
        rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3));
        rk = vreinterpretq_u8_u32(rk32);
        rk = vextq_u8(rk, rk, 1);

        return aes_schedule_low_round(rk, prk);
}

static uint8x16_t
aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i)
{
        uint8x16_t y = vdupq_n_u8(0);

        x ^= s63;

        x = vqtbl1q_u8(x, mc_forward[0]);
        y ^= x;
        x = vqtbl1q_u8(x, mc_forward[0]);
        y ^= x;
        x = vqtbl1q_u8(x, mc_forward[0]);
        y ^= x;

        return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i)
{

        return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt);
}

static uint8x16_t
aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i)
{
        uint8x16_t y = vdupq_n_u8(0);

        x = aes_schedule_transform(x, dks1);
        y = vqtbl1q_u8(y ^ x, mc_forward[0]);
        x = aes_schedule_transform(x, dks2);
        y = vqtbl1q_u8(y ^ x, mc_forward[0]);
        x = aes_schedule_transform(x, dks3);
        y = vqtbl1q_u8(y ^ x, mc_forward[0]);
        x = aes_schedule_transform(x, dks4);
        y = vqtbl1q_u8(y ^ x, mc_forward[0]);

        return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_dec(uint8x16_t x)
{

        return aes_schedule_transform(x ^ s63, deskew);
}

static uint8x16_t
aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk)
{
        uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi);
        uint32x4_t prk32 = vreinterpretq_u32_u8(prk);
        uint32x4_t rk32;

        rk32 = prkhi32;
        rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2),
            vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)),
            3);
        rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2),
            vdupq_n_u32(vgetq_lane_u32(prk32, 3)),
            0);

        return vreinterpretq_u8_u32(rk32);
}

static uint8x16_t
aes_schedule_192_smearhi(uint8x16_t rk)
{
        uint64x2_t rk64 = vreinterpretq_u64_u8(rk);

        rk64 = vsetq_lane_u64(0, rk64, 0);

        return vreinterpretq_u8_u64(rk64);
}

void
aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
{
        uint32_t *rk32 = enc->aese_aes.aes_rk;
        uint8x16_t mrk;         /* mangled round key */
        uint8x16_t rk;          /* round key */
        uint8x16_t prk;         /* previous round key */
        uint8x16_t rcon_rot = rcon;
        uint64_t i = 3;

        /* input transform */
        rk = aes_schedule_transform(vld1q_u8(key), ipt);
        storeroundkey(rk32, rk);
        rk32 += 4;

        switch (nrounds) {
        case 10:
                for (;;) {
                        rk = aes_schedule_round(rk, rk, &rcon_rot);
                        if (--nrounds == 0)
                                break;
                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                }
                break;
        case 12: {
                uint8x16_t prkhi;       /* high half of previous round key */

                prk = rk;
                rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
                prkhi = aes_schedule_192_smearhi(rk);
                for (;;) {
                        prk = aes_schedule_round(rk, prk, &rcon_rot);
                        rk = vextq_u8(prkhi, prk, 8);

                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                        rk = aes_schedule_192_smear(prkhi, prk);
                        prkhi = aes_schedule_192_smearhi(rk);

                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                        rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
                        if ((nrounds -= 3) == 0)
                                break;

                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                        rk = aes_schedule_192_smear(prkhi, prk);
                        prkhi = aes_schedule_192_smearhi(rk);
                }
                break;
        }
        case 14: {
                uint8x16_t pprk;        /* previous previous round key */

                prk = rk;
                rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
                for (;;) {
                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                        pprk = rk;

                        /* high round */
                        rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
                        if ((nrounds -= 2) == 0)
                                break;
                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;

                        /* low round */
                        rk = vreinterpretq_u8_u32(
                            vdupq_n_u32(
                                vgetq_lane_u32(vreinterpretq_u32_u8(rk),
                                    3)));
                        rk = aes_schedule_low_round(rk, pprk);
                }
                break;
        }
        default:
                panic("invalid number of AES rounds: %u", nrounds);
        }
        storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4]));
}

void
aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
{
        uint32_t *rk32 = dec->aesd_aes.aes_rk;
        uint8x16_t mrk;         /* mangled round key */
        uint8x16_t ork;         /* original round key */
        uint8x16_t rk;          /* round key */
        uint8x16_t prk;         /* previous round key */
        uint8x16_t rcon_rot = rcon;
        unsigned i = nrounds == 12 ? 0 : 2;

        ork = vld1q_u8(key);

        /* input transform */
        rk = aes_schedule_transform(ork, ipt);

        /* go from end */
        rk32 += 4*nrounds;
        storeroundkey(rk32, vqtbl1q_u8(ork, sr[i]));
        rk32 -= 4;
        i ^= 3;

        switch (nrounds) {
        case 10:
                for (;;) {
                        rk = aes_schedule_round(rk, rk, &rcon_rot);
                        if (--nrounds == 0)
                                break;
                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                }
                break;
        case 12: {
                uint8x16_t prkhi;       /* high half of previous round key */

                prk = rk;
                rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
                prkhi = aes_schedule_192_smearhi(rk);
                for (;;) {
                        prk = aes_schedule_round(rk, prk, &rcon_rot);
                        rk = vextq_u8(prkhi, prk, 8);

                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                        rk = aes_schedule_192_smear(prkhi, prk);
                        prkhi = aes_schedule_192_smearhi(rk);

                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                        rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
                        if ((nrounds -= 3) == 0)
                                break;

                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                        rk = aes_schedule_192_smear(prkhi, prk);
                        prkhi = aes_schedule_192_smearhi(rk);
                }
                break;
        }
        case 14: {
                uint8x16_t pprk;        /* previous previous round key */

                prk = rk;
                rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
                for (;;) {
                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                        pprk = rk;

                        /* high round */
                        rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
                        if ((nrounds -= 2) == 0)
                                break;
                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;

                        /* low round */
                        rk = vreinterpretq_u8_u32(
                            vdupq_n_u32(
                                vgetq_lane_u32(vreinterpretq_u32_u8(rk),
                                    3)));
                        rk = aes_schedule_low_round(rk, pprk);
                }
                break;
        }
        default:
                panic("invalid number of AES rounds: %u", nrounds);
        }
        storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
}

#ifdef __aarch64__

/*
 * GCC does a lousy job of compiling NEON intrinsics for arm32, so we
 * do the performance-critical parts -- encryption and decryption -- in
 * hand-written assembly on arm32.
 */

uint8x16_t
aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
{
        const uint32_t *rk32 = enc->aese_aes.aes_rk;
        uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
        uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
        uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
        uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
        uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
        uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
        uint8x16_t io, jo;
        unsigned rmod4 = 0;

        x = aes_schedule_transform(x, ipt);
        x ^= loadroundkey(rk32);
        for (;;) {
                uint8x16_t A, A2, A2_B, A2_B_D;

                subbytes(&io, &jo, x, inv_, inva_);

                rk32 += 4;
                rmod4 = (rmod4 + 1) % 4;
                if (--nrounds == 0)
                        break;

                A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo);
                A ^= loadroundkey(rk32);
                A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo);
                A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]);
                A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]);
                x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]);
        }
        x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo);
        x ^= loadroundkey(rk32);
        return vqtbl1q_u8(x, sr[rmod4]);
}
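
/*
 * Hypothetical caller sketch, for illustration only (not used anywhere;
 * in-kernel callers must also ensure the NEON unit is usable before
 * touching these routines): expand a 128-bit key with
 * aes_neon_setenckey and encrypt a single block with aes_neon_enc1,
 * using nrounds = 10 as in the key-schedule switch above.
 */
static inline void
aes_neon_demo_encrypt_block(const uint8_t key[static 16],
    const uint8_t in[static 16], uint8_t out[static 16])
{
        struct aesenc enc;

        aes_neon_setenckey(&enc, key, 10);      /* AES-128 */
        vst1q_u8(out, aes_neon_enc1(&enc, vld1q_u8(in), 10));
}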

uint8x16_t
aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
{
        const uint32_t *rk32 = dec->aesd_aes.aes_rk;
        unsigned i = 3 & ~(nrounds - 1);
        uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
        uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
        uint8x16_t io, jo, mc;

        x = aes_schedule_transform(x, dipt);
        x ^= loadroundkey(rk32);
        rk32 += 4;

        mc = mc_forward[3];
        for (;;) {
                subbytes(&io, &jo, x, inv_, inva_);
                if (--nrounds == 0)
                        break;

                x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo);
                x ^= loadroundkey(rk32);
                rk32 += 4;      /* next round key */

                x = vqtbl1q_u8(x, mc);
                x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo);

                x = vqtbl1q_u8(x, mc);
                x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo);

                x = vqtbl1q_u8(x, mc);
                x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo);

                mc = vextq_u8(mc, mc, 12);
        }
        x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo);
        x ^= loadroundkey(rk32);
        return vqtbl1q_u8(x, sr[i]);
}

#endif