/* $NetBSD: aes_neon.c,v 1.6 2020/11/21 08:09:21 rin Exp $ */

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES
 * software, at <https://crypto.stanford.edu/vpaes/>, described in
 *
 *      Mike Hamburg, `Accelerating AES with Vector Permute
 *      Instructions', in Christophe Clavier and Kris Gaj (eds.),
 *      Cryptographic Hardware and Embedded Systems -- CHES 2009,
 *      Springer LNCS 5747, pp. 18-32.
 *
 *      https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.6 2020/11/21 08:09:21 rin Exp $");

#include <sys/types.h>

#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <err.h>
#define panic(fmt, args...)     err(1, fmt, ##args)
#endif

#include "aes_neon_impl.h"

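/*
 * Several of the tables below are referenced only by the aarch64 C
 * implementations of the encrypt/decrypt cores; on arm32, where those
 * cores are written in assembly, mark such tables __unused so they do
 * not trigger unused-variable warnings.
 */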
#ifdef __aarch64__
#define __aarch64_used
#else
#define __aarch64_used  __unused
#endif

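/*
 * Constant tables from Hamburg's VPAES construction; the names follow
 * the original.  Briefly: mc_forward/mc_backward are the byte
 * rotations used to build MixColumns; sr holds the ShiftRows
 * permutation for each round mod 4; ipt/opt/dipt change between the
 * standard basis and the permuted basis on input and output; sb1, sb2,
 * sbo and the dsb* tables are the split S-box lookup tables for
 * encryption and decryption; dks1-4 and deskew serve the decryption
 * key schedule; rcon holds the round constants; `of' is the 0x0F
 * nybble mask; s63 is the S-box's additive constant 0x63 expressed in
 * the permuted basis; and inv/inva are the GF(2^4) inversion tables.
 */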
static const uint8x16_t
mc_forward[4] = {
        VQ_N_U8(0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
            0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C),
        VQ_N_U8(0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
            0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00),
        VQ_N_U8(0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
            0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04),
        VQ_N_U8(0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
            0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08),
},
mc_backward[4] __aarch64_used = {
        VQ_N_U8(0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
            0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E),
        VQ_N_U8(0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
            0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A),
        VQ_N_U8(0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
            0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06),
        VQ_N_U8(0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
            0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02),
},
ipt[2] __aarch64_used = {
        VQ_N_U8(0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
            0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA),
        VQ_N_U8(0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
            0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD),
},
opt[2] = {
        VQ_N_U8(0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
            0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7),
        VQ_N_U8(0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
            0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1),
},
dipt[2] __aarch64_used = {
        VQ_N_U8(0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
            0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15),
        VQ_N_U8(0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
            0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12),
},
sb1[2] __aarch64_used = {
        VQ_N_U8(0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
            0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5),
        VQ_N_U8(0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
            0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B),
},
sb2[2] __aarch64_used = {
        VQ_N_U8(0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
            0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E),
        VQ_N_U8(0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
            0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2),
},
sbo[2] __aarch64_used = {
        VQ_N_U8(0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
            0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15),
        VQ_N_U8(0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
            0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E),
},
dsb9[2] __aarch64_used = {
        VQ_N_U8(0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
            0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA),
        VQ_N_U8(0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
            0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72),
},
dsbd[2] __aarch64_used = {
        VQ_N_U8(0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
            0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5),
        VQ_N_U8(0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
            0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29),
},
dsbb[2] __aarch64_used = {
        VQ_N_U8(0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
            0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60),
        VQ_N_U8(0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
            0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3),
},
dsbe[2] __aarch64_used = {
        VQ_N_U8(0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
            0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22),
        VQ_N_U8(0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
            0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94),
},
dsbo[2] __aarch64_used = {
        VQ_N_U8(0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
            0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7),
        VQ_N_U8(0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
            0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA),
},
dks1[2] = {
        VQ_N_U8(0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
            0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A),
        VQ_N_U8(0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
            0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B),
},
dks2[2] = {
        VQ_N_U8(0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
            0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46),
        VQ_N_U8(0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
            0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73),
},
dks3[2] = {
        VQ_N_U8(0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
            0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8),
        VQ_N_U8(0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
            0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5),
},
dks4[2] = {
        VQ_N_U8(0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
            0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0),
        VQ_N_U8(0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0,
            0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F),
},
deskew[2] = {
        VQ_N_U8(0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07,
            0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D),
        VQ_N_U8(0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
            0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28),
},
sr[4] __aarch64_used = {
        VQ_N_U8(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
            0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F),
        VQ_N_U8(0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
            0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B),
        VQ_N_U8(0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F,
            0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07),
        VQ_N_U8(0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B,
            0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03),
},
rcon = VQ_N_U8(0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F,
    0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70),
of = VQ_N_U8(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,
    0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F),
s63 = VQ_N_U8(0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,
    0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B),
inv = VQ_N_U8(0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E,
    0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04),
inva = VQ_N_U8(0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
    0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03);

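/*
 * loadroundkey is referenced only by the aarch64 encrypt/decrypt cores
 * below; on arm32 it would be an unused static function, hence the
 * #ifdef.
 */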
#ifdef __aarch64__
static inline uint8x16_t
loadroundkey(const void *rkp)
{
        return vld1q_u8(rkp);
}
#endif

static inline void
storeroundkey(void *rkp, uint8x16_t rk)
{
        vst1q_u8(rkp, rk);
}

/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */
static inline void
bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x)
{

        *lo = of & x;
        *hi = of & vshrq_n_u8(x, 4);
}
/*
 * t is a pair of maps respectively from low and high nybbles to bytes.
 * Apply t to the nybbles, and add the results in GF(2).
 */
static uint8x16_t
aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2])
{
        uint8x16_t lo, hi;

        bytes2nybbles(&lo, &hi, x);
        return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi);
}

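/*
 * Compute the intermediate lookup indices *io and *jo from which the
 * sb1/sb2/sbo and dsb9/dsbd/dsbb/dsbe tables recover S-box outputs.
 * This is VPAES's decomposition of inversion in GF(2^8) into GF(2^4)
 * table lookups on the nybbles, via the inv and inva tables.
 */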
static inline void
subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_,
    uint8x16_t inva_)
{
        uint8x16_t k, i, ak, j;

        bytes2nybbles(&k, &i, x);
        ak = vqtbl1q_u8(inva_, k);
        j = i ^ k;
        *io = j ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, i));
        *jo = i ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, j));
}

static uint8x16_t
aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk)
{
        uint8x16_t io, jo;

        /* smear prk */
        prk ^= vextq_u8(vdupq_n_u8(0), prk, 12);
        prk ^= vextq_u8(vdupq_n_u8(0), prk, 8);
        prk ^= s63;

        /* subbytes */
        subbytes(&io, &jo, rk, inv, inva);
        rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo);

        /* add in smeared stuff */
        return rk ^ prk;
}

static uint8x16_t
aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot)
{
        uint32x4_t rk32;

        /* extract rcon from rcon_rot */
        prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15);
        *rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15);

        /* rotate */
        rk32 = vreinterpretq_u32_u8(rk);
        rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3));
        rk = vreinterpretq_u8_u32(rk32);
        rk = vextq_u8(rk, rk, 1);

        return aes_schedule_low_round(rk, prk);
}

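/*
 * Mangle a round key for storage, per the VPAES key schedule: add the
 * s63 constant, smear with three applications of mc_forward, and
 * permute by the ShiftRows slice sr_i so the stored key matches the
 * rotated state representation used by the encryption core.
 */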
static uint8x16_t
aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i)
{
        uint8x16_t y = vdupq_n_u8(0);

        x ^= s63;

        x = vqtbl1q_u8(x, mc_forward[0]);
        y ^= x;
        x = vqtbl1q_u8(x, mc_forward[0]);
        y ^= x;
        x = vqtbl1q_u8(x, mc_forward[0]);
        y ^= x;

        return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i)
{

        return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt);
}

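/*
 * Decryption round keys additionally pass through the dks1-4
 * transforms, which account for the InvMixColumns folded into the
 * decryption rounds.
 */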
static uint8x16_t
aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i)
{
        uint8x16_t y = vdupq_n_u8(0);

        x = aes_schedule_transform(x, dks1);
        y = vqtbl1q_u8(y ^ x, mc_forward[0]);
        x = aes_schedule_transform(x, dks2);
        y = vqtbl1q_u8(y ^ x, mc_forward[0]);
        x = aes_schedule_transform(x, dks3);
        y = vqtbl1q_u8(y ^ x, mc_forward[0]);
        x = aes_schedule_transform(x, dks4);
        y = vqtbl1q_u8(y ^ x, mc_forward[0]);

        return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_dec(uint8x16_t x)
{

        return aes_schedule_transform(x ^ s63, deskew);
}

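/*
 * AES-192 generates round-key material in 192-bit chunks, which do not
 * align with the 128-bit round keys; the smear helpers below stitch
 * the high half of one chunk onto the next.
 */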
static uint8x16_t
aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk)
{
        uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi);
        uint32x4_t prk32 = vreinterpretq_u32_u8(prk);
        uint32x4_t rk32;

        rk32 = prkhi32;
        rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2),
            vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)),
            3);
        rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2),
            vdupq_n_u32(vgetq_lane_u32(prk32, 3)),
            0);

        return vreinterpretq_u8_u32(rk32);
}

static uint8x16_t
aes_schedule_192_smearhi(uint8x16_t rk)
{
        uint64x2_t rk64 = vreinterpretq_u64_u8(rk);

        rk64 = vsetq_lane_u64(0, rk64, 0);

        return vreinterpretq_u8_u64(rk64);
}

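/*
 * Expand a 128/192/256-bit key (nrounds = 10/12/14) into the mangled
 * encryption round-key schedule.
 */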
void
aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
{
        uint32_t *rk32 = enc->aese_aes.aes_rk;
        uint8x16_t mrk;         /* mangled round key */
        uint8x16_t rk;          /* round key */
        uint8x16_t prk;         /* previous round key */
        uint8x16_t rcon_rot = rcon;
        uint64_t i = 3;

        /* input transform */
        rk = aes_schedule_transform(vld1q_u8(key), ipt);
        storeroundkey(rk32, rk);
        rk32 += 4;

        switch (nrounds) {
        case 10:
                for (;;) {
                        rk = aes_schedule_round(rk, rk, &rcon_rot);
                        if (--nrounds == 0)
                                break;
                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                }
                break;
        case 12: {
                uint8x16_t prkhi;       /* high half of previous round key */

                prk = rk;
                rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
                prkhi = aes_schedule_192_smearhi(rk);
                for (;;) {
                        prk = aes_schedule_round(rk, prk, &rcon_rot);
                        rk = vextq_u8(prkhi, prk, 8);

                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                        rk = aes_schedule_192_smear(prkhi, prk);
                        prkhi = aes_schedule_192_smearhi(rk);

                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                        rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
                        if ((nrounds -= 3) == 0)
                                break;

                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                        rk = aes_schedule_192_smear(prkhi, prk);
                        prkhi = aes_schedule_192_smearhi(rk);
                }
                break;
        }
        case 14: {
                uint8x16_t pprk;        /* previous previous round key */

                prk = rk;
                rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
                for (;;) {
                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;
                        pprk = rk;

                        /* high round */
                        rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
                        if ((nrounds -= 2) == 0)
                                break;
                        mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 += 4;

                        /* low round */
                        rk = vreinterpretq_u8_u32(
                            vdupq_n_u32(
                                vgetq_lane_u32(vreinterpretq_u32_u8(rk),
                                    3)));
                        rk = aes_schedule_low_round(rk, pprk);
                }
                break;
        }
        default:
                panic("invalid number of AES rounds: %u", nrounds);
        }
        storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4]));
}

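/*
 * Same expansion as above, but the round keys are stored from the end
 * of the schedule backwards and mangled with the decryption
 * transforms.
 */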
void
aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
{
        uint32_t *rk32 = dec->aesd_aes.aes_rk;
        uint8x16_t mrk;         /* mangled round key */
        uint8x16_t ork;         /* original round key */
        uint8x16_t rk;          /* round key */
        uint8x16_t prk;         /* previous round key */
        uint8x16_t rcon_rot = rcon;
        unsigned i = nrounds == 12 ? 0 : 2;

        ork = vld1q_u8(key);

        /* input transform */
        rk = aes_schedule_transform(ork, ipt);

        /* go from end */
        rk32 += 4*nrounds;
        storeroundkey(rk32, vqtbl1q_u8(ork, sr[i]));
        rk32 -= 4;
        i ^= 3;

        switch (nrounds) {
        case 10:
                for (;;) {
                        rk = aes_schedule_round(rk, rk, &rcon_rot);
                        if (--nrounds == 0)
                                break;
                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                }
                break;
        case 12: {
                uint8x16_t prkhi;       /* high half of previous round key */

                prk = rk;
                rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
                prkhi = aes_schedule_192_smearhi(rk);
                for (;;) {
                        prk = aes_schedule_round(rk, prk, &rcon_rot);
                        rk = vextq_u8(prkhi, prk, 8);

                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                        rk = aes_schedule_192_smear(prkhi, prk);
                        prkhi = aes_schedule_192_smearhi(rk);

                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                        rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
                        if ((nrounds -= 3) == 0)
                                break;

                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                        rk = aes_schedule_192_smear(prkhi, prk);
                        prkhi = aes_schedule_192_smearhi(rk);
                }
                break;
        }
        case 14: {
                uint8x16_t pprk;        /* previous previous round key */

                prk = rk;
                rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
                for (;;) {
                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;
                        pprk = rk;

                        /* high round */
                        rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
                        if ((nrounds -= 2) == 0)
                                break;
                        mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
                        storeroundkey(rk32, mrk);
                        rk32 -= 4;

                        /* low round */
                        rk = vreinterpretq_u8_u32(
                            vdupq_n_u32(
                                vgetq_lane_u32(vreinterpretq_u32_u8(rk),
                                    3)));
                        rk = aes_schedule_low_round(rk, pprk);
                }
                break;
        }
        default:
                panic("invalid number of AES rounds: %u", nrounds);
        }
        storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
}

#ifdef __aarch64__

/*
 * GCC does a lousy job of compiling NEON intrinsics for arm32, so we
 * do the performance-critical parts -- encryption and decryption -- in
 * hand-written assembly on arm32.
 */

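/*
 * Note on the volatile loads below: reading the tables through a
 * volatile pointer appears to be a deliberate nudge to make the
 * compiler materialize each one in a register once, up front, rather
 * than reloading it on every iteration of the round loop.
 */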
uint8x16_t
aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
{
        const uint32_t *rk32 = enc->aese_aes.aes_rk;
        uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
        uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
        uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
        uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
        uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
        uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
        uint8x16_t io, jo;
        unsigned rmod4 = 0;

        x = aes_schedule_transform(x, ipt);
        x ^= loadroundkey(rk32);
        for (;;) {
                uint8x16_t A, A2, A2_B, A2_B_D;

                subbytes(&io, &jo, x, inv_, inva_);

                rk32 += 4;
                rmod4 = (rmod4 + 1) % 4;
                if (--nrounds == 0)
                        break;

                A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo);
                A ^= loadroundkey(rk32);
                A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo);
                A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]);
                A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]);
                x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]);
        }
        x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo);
        x ^= loadroundkey(rk32);
        return vqtbl1q_u8(x, sr[rmod4]);
}

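/*
 * Two-block variant of aes_neon_enc1: the independent x0/x1 streams
 * are interleaved instruction by instruction to expose more
 * instruction-level parallelism to the NEON pipeline.
 */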
uint8x16x2_t
aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
{
        const uint32_t *rk32 = enc->aese_aes.aes_rk;
        uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
        uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
        uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
        uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
        uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
        uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
        uint8x16_t x0 = x.val[0], x1 = x.val[1];
        uint8x16_t io0, jo0, io1, jo1;
        unsigned rmod4 = 0;

        x0 = aes_schedule_transform(x0, ipt);
        x1 = aes_schedule_transform(x1, ipt);
        x0 ^= loadroundkey(rk32);
        x1 ^= loadroundkey(rk32);
        for (;;) {
                uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
                uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;

                subbytes(&io0, &jo0, x0, inv_, inva_);
                subbytes(&io1, &jo1, x1, inv_, inva_);

                rk32 += 4;
                rmod4 = (rmod4 + 1) % 4;
                if (--nrounds == 0)
                        break;

                A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
                A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
                A_0 ^= loadroundkey(rk32);
                A_1 ^= loadroundkey(rk32);
                A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
                A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
                A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
                A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
                A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
                A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
                x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
                x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
        }
        x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
        x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
        x0 ^= loadroundkey(rk32);
        x1 ^= loadroundkey(rk32);
        return (uint8x16x2_t) { .val = {
                [0] = vqtbl1q_u8(x0, sr[rmod4]),
                [1] = vqtbl1q_u8(x1, sr[rmod4]),
        } };
}

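/*
 * Decryption core: each round does a dsb9 lookup plus key addition,
 * then alternates mc permutations with dsbd/dsbb/dsbe lookups; taken
 * together these implement the inverse round per the VPAES
 * construction.
 */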
uint8x16_t
aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
{
        const uint32_t *rk32 = dec->aesd_aes.aes_rk;
        unsigned i = 3 & ~(nrounds - 1);
        uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
        uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
        uint8x16_t io, jo, mc;

        x = aes_schedule_transform(x, dipt);
        x ^= loadroundkey(rk32);
        rk32 += 4;

        mc = mc_forward[3];
        for (;;) {
                subbytes(&io, &jo, x, inv_, inva_);
                if (--nrounds == 0)
                        break;

                x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo);
                x ^= loadroundkey(rk32);
                rk32 += 4;      /* next round key */

                x = vqtbl1q_u8(x, mc);
                x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo);

                x = vqtbl1q_u8(x, mc);
                x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo);

                x = vqtbl1q_u8(x, mc);
                x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo);

                mc = vextq_u8(mc, mc, 12);
        }
        x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo);
        x ^= loadroundkey(rk32);
        return vqtbl1q_u8(x, sr[i]);
}

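/* Two-block variant of aes_neon_dec1, interleaved like aes_neon_enc2. */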
uint8x16x2_t
aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
{
        const uint32_t *rk32 = dec->aesd_aes.aes_rk;
        unsigned i = 3 & ~(nrounds - 1);
        uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
        uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
        uint8x16_t x0 = x.val[0], x1 = x.val[1];
        uint8x16_t io0, jo0, io1, jo1, mc;

        x0 = aes_schedule_transform(x0, dipt);
        x1 = aes_schedule_transform(x1, dipt);
        x0 ^= loadroundkey(rk32);
        x1 ^= loadroundkey(rk32);
        rk32 += 4;

        mc = mc_forward[3];
        for (;;) {
                subbytes(&io0, &jo0, x0, inv_, inva_);
                subbytes(&io1, &jo1, x1, inv_, inva_);
                if (--nrounds == 0)
                        break;

                x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
                x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
                x0 ^= loadroundkey(rk32);
                x1 ^= loadroundkey(rk32);
                rk32 += 4;      /* next round key */

                x0 = vqtbl1q_u8(x0, mc);
                x1 = vqtbl1q_u8(x1, mc);
                x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
                x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);

                x0 = vqtbl1q_u8(x0, mc);
                x1 = vqtbl1q_u8(x1, mc);
                x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
                x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);

                x0 = vqtbl1q_u8(x0, mc);
                x1 = vqtbl1q_u8(x1, mc);
                x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
                x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);

                mc = vextq_u8(mc, mc, 12);
        }
        x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
        x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
        x0 ^= loadroundkey(rk32);
        x1 ^= loadroundkey(rk32);
        return (uint8x16x2_t) { .val = {
                [0] = vqtbl1q_u8(x0, sr[i]),
                [1] = vqtbl1q_u8(x1, sr[i]),
        } };
}

#endif