/*	$NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES
 * software, at <https://crypto.stanford.edu/vpaes/>, described in
 *
 *      Mike Hamburg, `Accelerating AES with Vector Permute
 *      Instructions', in Christophe Clavier and Kris Gaj (eds.),
 *      Cryptographic Hardware and Embedded Systems -- CHES 2009,
 *      Springer LNCS 5747, pp. 18-32.
 *
 *      https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $");

#include <sys/types.h>

#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <err.h>
#define panic(fmt, args...)     err(1, fmt, ##args)
#endif

#include "aes_neon_impl.h"

#ifdef __aarch64__
#define __aarch64_used
#else
#define __aarch64_used  __unused
#endif

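/*
 * Constant lookup tables, expressed as 16-byte vectors for use with
 * vqtbl1q_u8 (a 16-entry byte table lookup / permute).  The layout and
 * names follow Hamburg's VPAES constants: mc_forward/mc_backward and sr
 * are byte permutations used for the MixColumns and ShiftRows steps;
 * ipt/opt/dipt and deskew are input/output basis changes; sb1/sb2/sbo
 * and the dsb* pairs are the split S-box output tables; dks1..dks4
 * transform round keys for the decryption schedule; rcon supplies
 * round-constant material for the key schedule; of and s63 are byte
 * masks; and inv/inva are the nybble-inversion tables used by subbytes.
 */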
static const uint8x16_t
mc_forward[4] = {
        VQ_N_U8(0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
            0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C),
        VQ_N_U8(0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
            0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00),
        VQ_N_U8(0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
            0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04),
        VQ_N_U8(0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
            0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08),
},
mc_backward[4] __aarch64_used = {
        VQ_N_U8(0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
            0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E),
        VQ_N_U8(0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
            0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A),
        VQ_N_U8(0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
            0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06),
        VQ_N_U8(0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
            0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02),
},
ipt[2] __aarch64_used = {
        VQ_N_U8(0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
            0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA),
        VQ_N_U8(0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
            0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD),
},
opt[2] = {
        VQ_N_U8(0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
            0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7),
        VQ_N_U8(0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
            0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1),
},
dipt[2] __aarch64_used = {
        VQ_N_U8(0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
            0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15),
        VQ_N_U8(0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
            0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12),
},
sb1[2] __aarch64_used = {
        VQ_N_U8(0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
            0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5),
        VQ_N_U8(0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
            0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B),
},
sb2[2] __aarch64_used = {
        VQ_N_U8(0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
            0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E),
        VQ_N_U8(0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
            0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2),
},
sbo[2] __aarch64_used = {
        VQ_N_U8(0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
            0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15),
        VQ_N_U8(0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
            0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E),
},
dsb9[2] __aarch64_used = {
        VQ_N_U8(0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
            0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA),
        VQ_N_U8(0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
            0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72),
},
dsbd[2] __aarch64_used = {
        VQ_N_U8(0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
            0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5),
        VQ_N_U8(0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
            0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29),
},
dsbb[2] __aarch64_used = {
        VQ_N_U8(0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
            0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60),
        VQ_N_U8(0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
            0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3),
},
dsbe[2] __aarch64_used = {
        VQ_N_U8(0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
            0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22),
        VQ_N_U8(0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
            0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94),
},
dsbo[2] __aarch64_used = {
        VQ_N_U8(0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
            0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7),
        VQ_N_U8(0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
            0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA),
},
dks1[2] = {
        VQ_N_U8(0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
            0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A),
        VQ_N_U8(0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
            0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B),
},
dks2[2] = {
        VQ_N_U8(0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
            0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46),
        VQ_N_U8(0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
            0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73),
},
dks3[2] = {
        VQ_N_U8(0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
            0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8),
        VQ_N_U8(0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
            0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5),
},
dks4[2] = {
        VQ_N_U8(0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
            0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0),
        VQ_N_U8(0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0,
            0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F),
},
deskew[2] = {
        VQ_N_U8(0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07,
            0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D),
        VQ_N_U8(0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
            0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28),
},
sr[4] __aarch64_used = {
        VQ_N_U8(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
            0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F),
        VQ_N_U8(0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
            0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B),
        VQ_N_U8(0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F,
            0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07),
        VQ_N_U8(0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B,
            0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03),
},
rcon = VQ_N_U8(0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F,
    0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70),
of = VQ_N_U8(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,
    0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F),
s63 = VQ_N_U8(0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,
    0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B),
inv = VQ_N_U8(0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E,
    0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04),
inva = VQ_N_U8(0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
    0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03);

static inline uint8x16_t
loadroundkey(const void *rkp)
{
        return vld1q_u8(rkp);
}

static inline void
storeroundkey(void *rkp, uint8x16_t rk)
{
        vst1q_u8(rkp, rk);
}

/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */
static inline void
bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x)
{

        *lo = of & x;
        *hi = of & vshrq_n_u8(x, 4);
}

/*
 * t is a pair of maps respectively from low and high nybbles to bytes.
 * Apply t to the nybbles, and add the results in GF(2).
 */
static uint8x16_t
aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2])
{
        uint8x16_t lo, hi;

        bytes2nybbles(&lo, &hi, x);
        return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi);
}

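/*
 * Split x into nybbles and run the VPAES nybble-based inversion on
 * them, using the inversion tables passed in as inv_/inva_.  The
 * results *io and *jo are the indices the caller feeds to the split
 * S-box output tables (sb1/sb2/sbo, or the dsb* tables for decryption).
 */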
static inline void
subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_,
    uint8x16_t inva_)
{
        uint8x16_t k, i, ak, j;

        bytes2nybbles(&k, &i, x);
        ak = vqtbl1q_u8(inva_, k);
        j = i ^ k;
        *io = j ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, i));
        *jo = i ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, j));
}

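/*
 * One `low' key-schedule round: smear prk across the vector by XORing
 * in copies of itself shifted up by 4 and then 8 bytes, add the s63
 * bias, push rk through the S-box via subbytes and the sb1 tables, and
 * return the XOR of the two.
 */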
static uint8x16_t
aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk)
{
        uint8x16_t io, jo;

        /* smear prk */
        prk ^= vextq_u8(vdupq_n_u8(0), prk, 12);
        prk ^= vextq_u8(vdupq_n_u8(0), prk, 8);
        prk ^= s63;

        /* subbytes */
        subbytes(&io, &jo, rk, inv, inva);
        rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo);

        /* add in smeared stuff */
        return rk ^ prk;
}

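/*
 * One full key-schedule round: XOR the last byte of *rcon_rot into the
 * low byte of prk and rotate *rcon_rot for the next round, broadcast
 * the last 32-bit word of rk and rotate it by one byte (RotWord), then
 * finish with aes_schedule_low_round.
 */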
static uint8x16_t
aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot)
{
        uint32x4_t rk32;

        /* extract rcon from rcon_rot */
        prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15);
        *rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15);

        /* rotate */
        rk32 = vreinterpretq_u32_u8(rk);
        rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3));
        rk = vreinterpretq_u8_u32(rk32);
        rk = vextq_u8(rk, rk, 1);

        return aes_schedule_low_round(rk, prk);
}

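/*
 * Mangle a round key for storage in the encryption schedule: add the
 * s63 bias, accumulate three successive mc_forward[0] byte rotations
 * of it, and apply the output permutation sr_i.
 */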
static uint8x16_t
aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i)
{
        uint8x16_t y = vdupq_n_u8(0);

        x ^= s63;

        x = vqtbl1q_u8(x, mc_forward[0]);
        y ^= x;
        x = vqtbl1q_u8(x, mc_forward[0]);
        y ^= x;
        x = vqtbl1q_u8(x, mc_forward[0]);
        y ^= x;

        return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i)
{

        return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt);
}

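/*
 * Mangle a round key for storage in the decryption schedule: run it
 * through the dks1..dks4 transforms, rotating and accumulating the
 * partial results with mc_forward[0], then apply the output
 * permutation sr_i.
 */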
static uint8x16_t
aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i)
{
        uint8x16_t y = vdupq_n_u8(0);

        x = aes_schedule_transform(x, dks1);
        y = vqtbl1q_u8(y ^ x, mc_forward[0]);
        x = aes_schedule_transform(x, dks2);
        y = vqtbl1q_u8(y ^ x, mc_forward[0]);
        x = aes_schedule_transform(x, dks3);
        y = vqtbl1q_u8(y ^ x, mc_forward[0]);
        x = aes_schedule_transform(x, dks4);
        y = vqtbl1q_u8(y ^ x, mc_forward[0]);

        return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_dec(uint8x16_t x)
{

        return aes_schedule_transform(x ^ s63, deskew);
}

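/*
 * Helpers for the 192-bit key schedule, which produces round-key
 * material in 24-byte chunks: aes_schedule_192_smear mixes 32-bit
 * words of prk and prkhi into a new round key, and
 * aes_schedule_192_smearhi clears the low 64 bits of its argument.
 */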
static uint8x16_t
aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk)
{
        uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi);
        uint32x4_t prk32 = vreinterpretq_u32_u8(prk);
        uint32x4_t rk32;

        rk32 = prkhi32;
        rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2),
            vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)),
            3);
        rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2),
            vdupq_n_u32(vgetq_lane_u32(prk32, 3)),
            0);

        return vreinterpretq_u8_u32(rk32);
}

static uint8x16_t
aes_schedule_192_smearhi(uint8x16_t rk)
{
        uint64x2_t rk64 = vreinterpretq_u64_u8(rk);

        rk64 = vsetq_lane_u64(0, rk64, 0);

        return vreinterpretq_u8_u64(rk64);
}

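/*
 * Expand the AES key `key' (nrounds = 10, 12, or 14 for AES-128/192/256)
 * into the encryption schedule enc, storing round keys in the
 * transformed (`mangled') form that the VPAES round functions expect.
 */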
void
aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
{
        uint32_t *rk32 = enc->aese_aes.aes_rk;
        uint8x16_t mrk;         /* mangled round key */
        uint8x16_t rk;          /* round key */
        uint8x16_t prk;         /* previous round key */
        uint8x16_t rcon_rot = rcon;
        uint64_t i = 3;

        /* input transform */
        rk = aes_schedule_transform(vld1q_u8(key), ipt);
        storeroundkey(rk32, rk);
        rk32 += 4;

        switch (nrounds) {
        case 10:
                for (;;) {
                        rk = aes_schedule_round(rk, rk, &rcon_rot);
                        if (--nrounds == 0)
                                break;
                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                }
                break;
        case 12: {
                uint8x16_t prkhi;       /* high half of previous round key */

                prk = rk;
                rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
                prkhi = aes_schedule_192_smearhi(rk);
                for (;;) {
                        prk = aes_schedule_round(rk, prk, &rcon_rot);
                        rk = vextq_u8(prkhi, prk, 8);

                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                        rk = aes_schedule_192_smear(prkhi, prk);
                        prkhi = aes_schedule_192_smearhi(rk);

                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                        rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
                        if ((nrounds -= 3) == 0)
                                break;

                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                        rk = aes_schedule_192_smear(prkhi, prk);
                        prkhi = aes_schedule_192_smearhi(rk);
                }
                break;
        }
        case 14: {
                uint8x16_t pprk;        /* previous previous round key */

                prk = rk;
                rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
                for (;;) {
                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                        pprk = rk;

                        /* high round */
                        rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
                        if ((nrounds -= 2) == 0)
                                break;
                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;

                        /* low round */
                        rk = vreinterpretq_u8_u32(
                            vdupq_n_u32(
                                vgetq_lane_u32(vreinterpretq_u32_u8(rk),
                                    3)));
                        rk = aes_schedule_low_round(rk, pprk);
                }
                break;
        }
        default:
                panic("invalid number of AES rounds: %u", nrounds);
        }
        storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4]));
}

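/*
 * Expand the AES key `key' into the decryption schedule dec.  The same
 * key-schedule rounds are run as for encryption, but each round key is
 * mangled with aes_schedule_mangle_dec and the schedule is filled in
 * from the end backwards.
 */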
void
aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
{
        uint32_t *rk32 = dec->aesd_aes.aes_rk;
        uint8x16_t mrk;         /* mangled round key */
        uint8x16_t ork;         /* original round key */
        uint8x16_t rk;          /* round key */
        uint8x16_t prk;         /* previous round key */
        uint8x16_t rcon_rot = rcon;
        unsigned i = nrounds == 12 ? 0 : 2;

        ork = vld1q_u8(key);

        /* input transform */
        rk = aes_schedule_transform(ork, ipt);

        /* go from end */
        rk32 += 4*nrounds;
        storeroundkey(rk32, vqtbl1q_u8(ork, sr[i]));
        rk32 -= 4;
        i ^= 3;

        switch (nrounds) {
        case 10:
                for (;;) {
                        rk = aes_schedule_round(rk, rk, &rcon_rot);
                        if (--nrounds == 0)
                                break;
                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                }
                break;
        case 12: {
                uint8x16_t prkhi;       /* high half of previous round key */

                prk = rk;
                rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
                prkhi = aes_schedule_192_smearhi(rk);
                for (;;) {
                        prk = aes_schedule_round(rk, prk, &rcon_rot);
                        rk = vextq_u8(prkhi, prk, 8);

                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                        rk = aes_schedule_192_smear(prkhi, prk);
                        prkhi = aes_schedule_192_smearhi(rk);

                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                        rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
                        if ((nrounds -= 3) == 0)
                                break;

                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                        rk = aes_schedule_192_smear(prkhi, prk);
                        prkhi = aes_schedule_192_smearhi(rk);
                }
                break;
        }
        case 14: {
                uint8x16_t pprk;        /* previous previous round key */

                prk = rk;
                rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
                for (;;) {
                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                        pprk = rk;

                        /* high round */
                        rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
                        if ((nrounds -= 2) == 0)
                                break;
                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;

                        /* low round */
                        rk = vreinterpretq_u8_u32(
                            vdupq_n_u32(
                                vgetq_lane_u32(vreinterpretq_u32_u8(rk),
                                    3)));
                        rk = aes_schedule_low_round(rk, pprk);
                }
                break;
        }
        default:
                panic("invalid number of AES rounds: %u", nrounds);
        }
        storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
}

#ifdef __aarch64__

/*
 * GCC does a lousy job of compiling NEON intrinsics for arm32, so we
 * do the performance-critical parts -- encryption and decryption -- in
 * hand-written assembly on arm32.
 */

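/*
 * Encrypt one 16-byte block with the VPAES round structure: each
 * iteration computes SubBytes via the nybble inversion in subbytes()
 * and the split sb1/sb2 output tables, then the ShiftRows/MixColumns
 * mixing via the mc_forward/mc_backward byte permutations; the last
 * round uses sbo and the sr permutation instead.
 */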
uint8x16_t
aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
{
        const uint32_t *rk32 = enc->aese_aes.aes_rk;
        uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
        uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
        uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
        uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
        uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
        uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
        uint8x16_t io, jo;
        unsigned rmod4 = 0;

        x = aes_schedule_transform(x, ipt);
        x ^= loadroundkey(rk32);
        for (;;) {
                uint8x16_t A, A2, A2_B, A2_B_D;

                subbytes(&io, &jo, x, inv_, inva_);

                rk32 += 4;
                rmod4 = (rmod4 + 1) % 4;
                if (--nrounds == 0)
                        break;

                A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo);
                A ^= loadroundkey(rk32);
                A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo);
                A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]);
                A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]);
                x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]);
        }
        x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo);
        x ^= loadroundkey(rk32);
        return vqtbl1q_u8(x, sr[rmod4]);
}

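/*
 * Two-block variant of aes_neon_enc1: x.val[0] and x.val[1] are
 * carried through the same round structure in interleaved fashion.
 */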
uint8x16x2_t
aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
{
        const uint32_t *rk32 = enc->aese_aes.aes_rk;
        uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
        uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
        uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
        uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
        uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
        uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
        uint8x16_t x0 = x.val[0], x1 = x.val[1];
        uint8x16_t io0, jo0, io1, jo1;
        unsigned rmod4 = 0;

        x0 = aes_schedule_transform(x0, ipt);
        x1 = aes_schedule_transform(x1, ipt);
        x0 ^= loadroundkey(rk32);
        x1 ^= loadroundkey(rk32);
        for (;;) {
                uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
                uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;

                subbytes(&io0, &jo0, x0, inv_, inva_);
                subbytes(&io1, &jo1, x1, inv_, inva_);

                rk32 += 4;
                rmod4 = (rmod4 + 1) % 4;
                if (--nrounds == 0)
                        break;

                A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
                A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
                A_0 ^= loadroundkey(rk32);
                A_1 ^= loadroundkey(rk32);
                A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
                A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
                A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
                A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
                A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
                A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
                x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
                x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
        }
        x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
        x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
        x0 ^= loadroundkey(rk32);
        x1 ^= loadroundkey(rk32);
        return (uint8x16x2_t) { .val = {
                [0] = vqtbl1q_u8(x0, sr[rmod4]),
                [1] = vqtbl1q_u8(x1, sr[rmod4]),
        } };
}

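/*
 * Decrypt one 16-byte block: each iteration applies the inverse S-box
 * via the dsb9/dsbd/dsbb/dsbe table pairs, interleaved with the byte
 * permutation mc, which is rotated by 12 bytes at the end of every
 * round; dsbo and sr finish the last round.
 */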
uint8x16_t
aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
{
        const uint32_t *rk32 = dec->aesd_aes.aes_rk;
        unsigned i = 3 & ~(nrounds - 1);
        uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
        uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
        uint8x16_t io, jo, mc;

        x = aes_schedule_transform(x, dipt);
        x ^= loadroundkey(rk32);
        rk32 += 4;

        mc = mc_forward[3];
        for (;;) {
                subbytes(&io, &jo, x, inv_, inva_);
                if (--nrounds == 0)
                        break;

                x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo);
                x ^= loadroundkey(rk32);
                rk32 += 4;      /* next round key */

                x = vqtbl1q_u8(x, mc);
                x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo);

                x = vqtbl1q_u8(x, mc);
                x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo);

                x = vqtbl1q_u8(x, mc);
                x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo);

                mc = vextq_u8(mc, mc, 12);
        }
        x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo);
        x ^= loadroundkey(rk32);
        return vqtbl1q_u8(x, sr[i]);
}

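/*
 * Two-block variant of aes_neon_dec1: x.val[0] and x.val[1] are
 * carried through the same round structure in interleaved fashion.
 */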
uint8x16x2_t
aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
{
        const uint32_t *rk32 = dec->aesd_aes.aes_rk;
        unsigned i = 3 & ~(nrounds - 1);
        uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
        uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
        uint8x16_t x0 = x.val[0], x1 = x.val[1];
        uint8x16_t io0, jo0, io1, jo1, mc;

        x0 = aes_schedule_transform(x0, dipt);
        x1 = aes_schedule_transform(x1, dipt);
        x0 ^= loadroundkey(rk32);
        x1 ^= loadroundkey(rk32);
        rk32 += 4;

        mc = mc_forward[3];
        for (;;) {
                subbytes(&io0, &jo0, x0, inv_, inva_);
                subbytes(&io1, &jo1, x1, inv_, inva_);
                if (--nrounds == 0)
                        break;

                x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
                x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
                x0 ^= loadroundkey(rk32);
                x1 ^= loadroundkey(rk32);
                rk32 += 4;      /* next round key */

                x0 = vqtbl1q_u8(x0, mc);
                x1 = vqtbl1q_u8(x1, mc);
                x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
                x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);

                x0 = vqtbl1q_u8(x0, mc);
                x1 = vqtbl1q_u8(x1, mc);
                x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
                x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);

                x0 = vqtbl1q_u8(x0, mc);
                x1 = vqtbl1q_u8(x1, mc);
                x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
                x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);

                mc = vextq_u8(mc, mc, 12);
        }
        x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
        x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
        x0 ^= loadroundkey(rk32);
        x1 ^= loadroundkey(rk32);
        return (uint8x16x2_t) { .val = {
                [0] = vqtbl1q_u8(x0, sr[i]),
                [1] = vqtbl1q_u8(x1, sr[i]),
        } };
}

#endif