/*	$NetBSD: aes_armv8_64.S,v 1.3 2020/06/30 21:53:39 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

	.arch_extension	aes

/*
 * uint32_t rcon[10]
 *
 *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2).
 *	Such elements of GF(2^8) need only eight bits to be represented,
 *	but we store them in 4-byte units so we can copy one into all
 *	four 4-byte lanes of a vector register with a single LD1R.  The
 *	access pattern is fixed, so indices into this table are never
 *	secret.
 */
	.section .rodata
	.p2align 2
	.type	rcon,@object
rcon:
	.long	0x01
	.long	0x02
	.long	0x04
	.long	0x08
	.long	0x10
	.long	0x20
	.long	0x40
	.long	0x80
	.long	0x1b
	.long	0x36
END(rcon)
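
/*
 * The rcon entries are successive powers of x in GF(2^8), so the table
 * can be regenerated by repeated doubling with the AES reduction
 * polynomial.  A minimal C sketch (illustrative only; `rcon' here just
 * names this table):
 *
 *	uint8_t r = 1;
 *	for (unsigned n = 0; n < 10; n++) {
 *		rcon[n] = r;				// r = x^n
 *		r = (r << 1) ^ (r & 0x80 ? 0x1b : 0);	// r *= x
 *	}
 */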

/*
 * uint128_t unshiftrows_rotword_1
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 1, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_1,@object
unshiftrows_rotword_1:
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
END(unshiftrows_rotword_1)

/*
 * uint128_t unshiftrows_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then copy word
 *	3 into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_3,@object
unshiftrows_3:
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
END(unshiftrows_3)

/*
 * uint128_t unshiftrows_rotword_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 3, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_3,@object
unshiftrows_rotword_3:
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
END(unshiftrows_rotword_3)
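
/*
 * How these TBL indices are derived: ShiftRows sends the pre-ShiftRows
 * byte i = 4*c + r to post-ShiftRows index 4*((c - r) mod 4) + r, so
 * reading pre-ShiftRows byte i back out of an AESE result means
 * indexing 4*((i/4 - i%4) & 3) + i%4.  A minimal C sketch that
 * regenerates unshiftrows_rotword_3 (names illustrative):
 *
 *	for (int w = 0; w < 4; w++) {		// splat into all 4 words
 *		for (int j = 0; j < 4; j++) {
 *			int i = 12 + ((j + 1) & 3);	// RotWord of word 3
 *			tbl[4*w + j] = 4*((i/4 - i%4) & 3) + i%4;
 *		}
 *	}
 *
 * For example, j = 0 wants pre-ShiftRows byte 13, which ShiftRows left
 * at index 9 -- the first entry above.
 */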

/*
 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey128)
	ldr	q1, [x1]	/* q1 := master key */

	adrl	x4, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_3 table */

	str	q1, [x0], #0x10	/* store master key as first round key */
	mov	x2, #10		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x0 = pointer to round key to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v8.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #1	/* count down rounds */
	str	q1, [x0], #0x10	/* store round key */
	b.ne	1b

	ret
END(aesarmv8_setenckey128)
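
/*
 * The EXT/EOR chain above computes one whole round of the FIPS-197
 * AES-128 key schedule at once: expanding the word-by-word recurrence
 *
 *	rk[0] = prk[0] ^ RotWord(SubWord(prk[3])) ^ rcon
 *	rk[i] = prk[i] ^ rk[i-1],	1 <= i <= 3
 *
 * gives rk[i] = prk[0] ^ ... ^ prk[i] ^ RotWord(SubWord(prk[3])) ^ rcon,
 * which is exactly the xor of v1 with the three shifted copies of
 * itself in v5/v6/v7 and the splatted core in v3.
 */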

/*
 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey192)
	ldr	q1, [x1], #0x10		/* q1 := master key[0:128) */
	ldr	d2, [x1]		/* d2 := master key[128:192) */

	adrl	x4, unshiftrows_rotword_1
	adrl	x5, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q8, [x4]		/* q8 := unshiftrows_rotword_1 */
	ldr	q9, [x5]		/* q9 := unshiftrows_rotword_3 */

	str	q1, [x0], #0x10		/* store master key[0:128) as round key */
	mov	x2, #12			/* round count */
	adrl	x3, rcon		/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
	 * x0 = pointer to three round keys to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v8.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *     ^ rklo[1]
	 */

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
	eor	v5.16b, v5.16b, v1.16b
	eor	v5.16b, v5.16b, v3.16b
	eor	v5.16b, v5.16b, v6.16b
	eor	v5.16b, v5.16b, v7.16b

	/*
	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
	 * (rklo[0],rklo[1],...).
	 */

	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	dup	v1.4s, v5.4s[3]
	mov	v1.4s[0], v5.4s[2]

	/*
	 * v6.4s := (0, 0, rklo[0], rklo[1])
	 * v7.4s := (0, 0, 0, rklo[0])
	 */
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v3.16b, v1.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	/*
	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
	 */
	mov	v2.2d[1], v5.2d[0]

	/* store two round keys */
	stp	q2, q3, [x0], #0x20

	/*
	 * Live vector registers at this point:
	 *
	 * q0 = zero
	 * q2 = rk
	 * q3 = nrk
	 * v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
	 * q8 = unshiftrows_rotword_1
	 * q9 = unshiftrows_rotword_3
	 *
	 * We have to compute, in q1:
	 *
	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1]
	 *
	 * And, if there's any more afterward, in q2:
	 *
	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2]
	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2] ^ nrk[3]
	 */

	/* q1 := ShiftRows(SubBytes(q3)) */
	mov	v1.16b, v3.16b
	aese	v1.16b, v0.16b

	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
	ld1r	{v4.4s}, [x3], #4
	tbl	v1.16b, {v1.16b}, v9.16b
	eor	v1.16b, v1.16b, v4.16b

	/*
	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1])	[already]
	 * v4.4s := (0, rk[2], rk[3], nrk[0])
	 * v6.4s := (0, 0, rk[2], rk[3])
	 * v7.4s := (0, 0, 0, rk[2])
	 */
	ext	v4.16b, v0.16b, v5.16b, #12
	ext	v6.16b, v0.16b, v5.16b, #8
	ext	v7.16b, v0.16b, v5.16b, #4

	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #3		/* count down three rounds */
	str	q1, [x0], #0x10		/* store third round key */
	b.eq	2f

	/*
	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
	 * v5.4s := (0, nrk[2], xxx, xxx)
	 */
	ext	v4.16b, v3.16b, v0.16b, #8
	ext	v5.16b, v0.16b, v4.16b, #12

	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
	dup	v2.4s, v1.4s[3]

	/*
	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
	 *     xxx, xxx)
	 */
	eor	v2.16b, v2.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	b	1b

2:	ret
END(aesarmv8_setenckey192)

/*
 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey256)
	/* q1 := key[0:128), q2 := key[128:256) */
	ldp	q1, q2, [x1], #0x20

	adrl	x4, unshiftrows_rotword_3
	adrl	x5, unshiftrows_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q8, [x4]		/* q8 := unshiftrows_rotword_3 */
	ldr	q9, [x5]		/* q9 := unshiftrows_3 */

	/* store master key as first two round keys */
	stp	q1, q2, [x0], #0x20
	mov	x2, #14			/* round count */
	adrl	x3, rcon		/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v8.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
	 * v6.4s := (0,0,pprk[0],pprk[1])
	 * v7.4s := (0,0,0,pprk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #2		/* count down two rounds */
	b.eq	2f			/* stop if this is the last one */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := SubBytes(rk[3]) */
	tbl	v3.16b, {v3.16b}, v9.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v2.16b, #12
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v2.16b, v2.16b, v3.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v2.16b, v2.16b, v7.16b

	stp	q1, q2, [x0], #0x20	/* store two round keys */
	b	1b

2:	str	q1, [x0]		/* store last round key */
	ret
END(aesarmv8_setenckey256)

/*
 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
 *     uint32_t nrounds@x2)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enctodec)
	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
1:	str	q0, [x1], #0x10		/* store round key */
	subs	x2, x2, #1		/* count down rounds */
	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
	b.eq	2f			/* stop if this is the last one */
	aesimc	v0.16b, v0.16b		/* convert encryption to decryption */
	b	1b
2:	str	q0, [x1]		/* store first round key verbatim */
	ret
END(aesarmv8_enctodec)
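
/*
 * This is the standard equivalent-inverse-cipher construction: the
 * decryption schedule is the encryption schedule in reverse, with
 * InvMixColumns applied to every round key except the first and last,
 * so AESD's AddRoundKey can be pushed through InvMixColumns.
 * Roughly, in C:
 *
 *	dec[0] = enc[nrounds];
 *	for (i = 1; i < nrounds; i++)
 *		dec[i] = InvMixColumns(enc[nrounds - i]);
 *	dec[nrounds] = enc[0];
 */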

/*
 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enc)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]		/* q0 := block */
	bl	aesarmv8_enc1
	str	q0, [x2]		/* store block */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_enc)

/*
 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_dec)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]		/* q0 := block */
	bl	aesarmv8_dec1
	str	q0, [x2]		/* store block */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_dec)

/*
 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_enc)
	cbz	x3, 2f			/* stop if nothing to do */
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q0, [x4]		/* q0 := chaining value */
1:	ldr	q1, [x1], #0x10		/* q1 := plaintext block */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_enc1		/* q0 := ciphertext block */
	subs	x10, x10, #0x10		/* count down nbytes */
	str	q0, [x2], #0x10		/* store ciphertext block */
	b.ne	1b			/* repeat if x10 is nonzero */
	str	q0, [x4]		/* store chaining value */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
2:	ret
END(aesarmv8_cbc_enc)
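
/*
 * CBC encryption is the recurrence
 *
 *	c[0] = E(k, iv ^ p[0])
 *	c[i] = E(k, c[i-1] ^ p[i])
 *
 * Each block's input depends on the previous block's output, so the
 * blocks of one stream cannot be encrypted in parallel -- which is
 * why, unlike decryption below, there is no 8-block variant here.
 */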

/*
 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec1)
	stp	fp, lr, [sp, #-32]!	/* push stack frame with uint128 */
	mov	fp, sp
	ldr	q8, [x4]		/* q8 := iv */
	str	q8, [sp, #16]		/* save iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	ldr	q0, [x1, #-0x10]!	/* q0 := last ciphertext block */
	str	q0, [x4]		/* update iv */
1:	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.eq	2f			/* stop if this is the first block */
	ldr	q8, [x1, #-0x10]!	/* q8 := chaining value */
	eor	v0.16b, v0.16b, v8.16b	/* q0 := plaintext block */
	str	q0, [x2, #-0x10]!	/* store plaintext block */
	mov	v0.16b, v8.16b		/* move cv = ciphertext block */
	b	1b
2:	ldr	q8, [sp, #16]		/* q8 := iv */
	eor	v0.16b, v0.16b, v8.16b	/* q0 := first plaintext block */
	str	q0, [x2, #-0x10]!	/* store first plaintext block */
	ldp	fp, lr, [sp], #32	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec1)
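
/*
 * CBC decryption computes
 *
 *	p[i] = D(k, c[i]) ^ c[i-1],	with c[-1] = iv,
 *
 * and each output depends only on ciphertext, so blocks can be
 * processed in any order; this routine walks backwards from the end,
 * which lets it record the new iv (the last ciphertext block) before
 * any output is written.
 */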

/*
 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec8)
	stp	fp, lr, [sp, #-32]!	/* push stack frame with uint128 */
	mov	fp, sp
	ldr	q8, [x4]		/* q8 := iv */
	str	q8, [sp, #16]		/* save iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	ldp	q6, q7, [x1, #-0x20]!	/* q6, q7 := last ciphertext blocks */
	str	q7, [x4]		/* update iv */
1:	ldp	q4, q5, [x1, #-0x20]!
	ldp	q2, q3, [x1, #-0x20]!
	ldp	q0, q1, [x1, #-0x20]!
	mov	v15.16b, v6.16b		/* q[8+i] := cv[i], 0<i<8 */
	mov	v14.16b, v5.16b
	mov	v13.16b, v4.16b
	mov	v12.16b, v3.16b
	mov	v11.16b, v2.16b
	mov	v10.16b, v1.16b
	mov	v9.16b, v0.16b
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec8		/* q[i] := cv[i] ^ pt[i] */
	eor	v7.16b, v7.16b, v15.16b	/* q[i] := pt[i] */
	eor	v6.16b, v6.16b, v14.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v1.16b, v1.16b, v9.16b
	subs	x10, x10, #0x80		/* count down nbytes */
	stp	q6, q7, [x2, #-0x20]!	/* store plaintext blocks */
	stp	q4, q5, [x2, #-0x20]!
	stp	q2, q3, [x2, #-0x20]!
	b.eq	2f			/* stop if this is the first block */
	ldp	q6, q7, [x1, #-0x20]!
	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!
	b	1b
2:	ldr	q8, [sp, #16]		/* q8 := iv */
	eor	v0.16b, v0.16b, v8.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!	/* store first two plaintext blocks */
	ldp	fp, lr, [sp], #32	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec8)

/*
 * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q9, [x4]		/* q9 := tweak */
1:	ldr	q0, [x1], #0x10		/* q0 := ptxt */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v9.16b	/* q0 := ptxt ^ tweak */
	bl	aesarmv8_enc1		/* q0 := AES(ptxt ^ tweak) */
	eor	v0.16b, v0.16b, v9.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
	str	q0, [x2], #0x10		/* store ciphertext block */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	str	q9, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc1)
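
/*
 * XTS encryption of block i is
 *
 *	c[i] = E(k, p[i] ^ t[i]) ^ t[i],	t[i+1] = t[i] * x,
 *
 * with multiplication in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1;
 * see aesarmv8_xts_mulx below for the tweak update.
 */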

/*
 * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc8)
	stp	fp, lr, [sp, #-48]!	/* push stack frame with uint128[2] */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q9, [x4]		/* q9 := tweak */
1:	str	q9, [sp, #16]		/* save tweak[0] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	str	q9, [sp, #32]		/* save tweak[1] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v10.16b, v9.16b		/* q10 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v11.16b, v9.16b		/* q11 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v12.16b, v9.16b		/* q12 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v13.16b, v9.16b		/* q13 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v14.16b, v9.16b		/* q14 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v15.16b, v9.16b		/* q15 := tweak[7] */
	ldp	q8, q9, [sp, #16]	/* q8 := tweak[0], q9 := tweak[1] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := pt[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v8.16b	/* q[i] := pt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_enc8		/* encrypt q0,...,q7; trash x0/x3/q8 */
	ldr	q8, [sp, #16]		/* reload q8 := tweak[0] */
	eor	v1.16b, v1.16b, v9.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v0.16b, v0.16b, v8.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b
	stp	q0, q1, [x2], #0x20	/* store ciphertext blocks */
	stp	q2, q3, [x2], #0x20
	stp	q4, q5, [x2], #0x20
	stp	q6, q7, [x2], #0x20
	mov	v9.16b, v15.16b		/* q9 := q15 = tweak[7] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80		/* count down nbytes */
	b.ne	1b			/* repeat if more block groups */
	str	q9, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #48	/* pop stack frame */
	ret
END(aesarmv8_xts_enc8)

/*
 * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q9, [x4]		/* q9 := tweak */
1:	ldr	q0, [x1], #0x10		/* q0 := ctxt */
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v9.16b	/* q0 := ctxt ^ tweak */
	bl	aesarmv8_dec1		/* q0 := AES^-1(ctxt ^ tweak) */
	eor	v0.16b, v0.16b, v9.16b	/* q0 := ptxt */
	str	q0, [x2], #0x10		/* store plaintext block */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	str	q9, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec1)

/*
 * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec8)
	stp	fp, lr, [sp, #-48]!	/* push stack frame with uint128[2] */
	mov	fp, sp
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q9, [x4]		/* q9 := tweak */
1:	str	q9, [sp, #16]		/* save tweak[0] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	str	q9, [sp, #32]		/* save tweak[1] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v10.16b, v9.16b		/* q10 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v11.16b, v9.16b		/* q11 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v12.16b, v9.16b		/* q12 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v13.16b, v9.16b		/* q13 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v14.16b, v9.16b		/* q14 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v15.16b, v9.16b		/* q15 := tweak[7] */
	ldp	q8, q9, [sp, #16]	/* q8 := tweak[0], q9 := tweak[1] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := ct[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v8.16b	/* q[i] := ct[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec8		/* decrypt q0,...,q7; trash x0/x3/q8 */
	ldr	q8, [sp, #16]		/* reload q8 := tweak[0] */
	eor	v1.16b, v1.16b, v9.16b	/* q[i] := AES^-1(...) ^ tweak[i] */
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v0.16b, v0.16b, v8.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b
	stp	q0, q1, [x2], #0x20	/* store plaintext blocks */
	stp	q2, q3, [x2], #0x20
	stp	q4, q5, [x2], #0x20
	stp	q6, q7, [x2], #0x20
	mov	v9.16b, v15.16b		/* q9 := q15 = tweak[7] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80		/* count down nbytes */
	b.ne	1b			/* repeat if more block groups */
	str	q9, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #48	/* pop stack frame */
	ret
END(aesarmv8_xts_dec8)

/*
 * aesarmv8_xts_mulx(tweak@q9)
 *
 *	Multiply q9 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses x0 and q0/q1 as temporaries.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_xts_mulx,@function
aesarmv8_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low half must be
	 *     shifted into the low bit of the high half, and
	 * (b) whether the high bit of the high half must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	adrl	x0, xtscarry
	cmlt	v1.2d, v9.2d, #0	/* v1.2d[i] := -1 if v9.2d[i] < 0, else 0 */
	ldr	q0, [x0]		/* q0 := xtscarry */
	ext	v1.16b, v1.16b, v1.16b, #8	/* swap halves of q1 */
	shl	v9.2d, v9.2d, #1	/* shift */
	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
	eor	v9.16b, v9.16b, v0.16b	/* incorporate (a) and (b) */
	ret
END(aesarmv8_xts_mulx)

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0
END(xtscarry)
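
/*
 * Scalar equivalent of aesarmv8_xts_mulx, as a minimal C sketch
 * (uint128 split into two uint64 halves; names illustrative):
 *
 *	uint64_t carry_lo = (int64_t)t.lo >> 63;  // -1 if bit 63 set
 *	uint64_t carry_hi = (int64_t)t.hi >> 63;  // -1 if bit 127 set
 *	t.lo = (t.lo << 1) ^ (carry_hi & 0x87);   // x^128 = x^7+x^2+x+1
 *	t.hi = (t.hi << 1) ^ (carry_lo & 1);      // carry across halves
 *
 * The CMLT/EXT/AND sequence computes both masked carries branchlessly:
 * CMLT makes the per-lane sign masks, EXT swaps them so each lane sees
 * the other lane's carry, and xtscarry supplies the constants 0x87
 * and 1.
 */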

/*
 * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_update)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q9, [x0]		/* load tweak */
	bl	aesarmv8_xts_mulx	/* q9 *= x */
	str	q9, [x1]		/* store tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_update)

/*
 * aesarmv8_enc1(const struct aesenc *enckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Encrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc1,@function
aesarmv8_enc1:
	ldr	q8, [x0], #0x10		/* load round key */
1:	subs	x3, x3, #1
	/* q0 := ShiftRows(SubBytes(AddRoundKey_q8(q0))) */
	aese	v0.16b, v8.16b
	ldr	q8, [x0], #0x10		/* load next round key */
	b.eq	2f
	/* q0 := MixColumns(q0) */
	aesmc	v0.16b, v0.16b
	b	1b
2:	eor	v0.16b, v0.16b, v8.16b
	ret
END(aesarmv8_enc1)
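
/*
 * Note on round structure: AESE computes
 * ShiftRows(SubBytes(AddRoundKey)), so one loop iteration plus AESMC
 * is one FIPS-197 round, and the final AddRoundKey is the bare EOR at
 * label 2.  Roughly, for nrounds + 1 round keys rk[0..nrounds]:
 *
 *	for (r = 0; r < nrounds - 1; r++)
 *		x = MixColumns(ShiftRows(SubBytes(x ^ rk[r])));
 *	x = ShiftRows(SubBytes(x ^ rk[nrounds-1])) ^ rk[nrounds];
 */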

/*
 * aesarmv8_enc8(const struct aesenc *enckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Encrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc8,@function
aesarmv8_enc8:
	ldr	q8, [x0], #0x10		/* load round key */
1:	subs	x3, x3, #1
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q8(q[i]))) */
	aese	v0.16b, v8.16b
	aese	v1.16b, v8.16b
	aese	v2.16b, v8.16b
	aese	v3.16b, v8.16b
	aese	v4.16b, v8.16b
	aese	v5.16b, v8.16b
	aese	v6.16b, v8.16b
	aese	v7.16b, v8.16b
	ldr	q8, [x0], #0x10		/* load next round key */
	b.eq	2f
	/* q[i] := MixColumns(q[i]) */
	aesmc	v0.16b, v0.16b
	aesmc	v1.16b, v1.16b
	aesmc	v2.16b, v2.16b
	aesmc	v3.16b, v3.16b
	aesmc	v4.16b, v4.16b
	aesmc	v5.16b, v5.16b
	aesmc	v6.16b, v6.16b
	aesmc	v7.16b, v7.16b
	b	1b
2:	eor	v0.16b, v0.16b, v8.16b	/* AddRoundKey */
	eor	v1.16b, v1.16b, v8.16b
	eor	v2.16b, v2.16b, v8.16b
	eor	v3.16b, v3.16b, v8.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v8.16b
	eor	v6.16b, v6.16b, v8.16b
	eor	v7.16b, v7.16b, v8.16b
	ret
END(aesarmv8_enc8)

/*
 * aesarmv8_dec1(const struct aesdec *deckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Decrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec1,@function
aesarmv8_dec1:
	ldr	q8, [x0], #0x10		/* load round key */
1:	subs	x3, x3, #1
	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q8(q0))) */
	aesd	v0.16b, v8.16b
	ldr	q8, [x0], #0x10		/* load next round key */
	b.eq	2f
	/* q0 := InMixColumns(q0) */
	aesimc	v0.16b, v0.16b
	b	1b
2:	eor	v0.16b, v0.16b, v8.16b
	ret
END(aesarmv8_dec1)

/*
 * aesarmv8_dec8(const struct aesdec *deckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Decrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec8,@function
aesarmv8_dec8:
	ldr	q8, [x0], #0x10		/* load round key */
1:	subs	x3, x3, #1
	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q8(q[i]))) */
	aesd	v0.16b, v8.16b
	aesd	v1.16b, v8.16b
	aesd	v2.16b, v8.16b
	aesd	v3.16b, v8.16b
	aesd	v4.16b, v8.16b
	aesd	v5.16b, v8.16b
	aesd	v6.16b, v8.16b
	aesd	v7.16b, v8.16b
	ldr	q8, [x0], #0x10		/* load next round key */
	b.eq	2f
	/* q[i] := InMixColumns(q[i]) */
	aesimc	v0.16b, v0.16b
	aesimc	v1.16b, v1.16b
	aesimc	v2.16b, v2.16b
	aesimc	v3.16b, v3.16b
	aesimc	v4.16b, v4.16b
	aesimc	v5.16b, v5.16b
	aesimc	v6.16b, v6.16b
	aesimc	v7.16b, v7.16b
	b	1b
2:	eor	v0.16b, v0.16b, v8.16b	/* AddRoundKey */
	eor	v1.16b, v1.16b, v8.16b
	eor	v2.16b, v2.16b, v8.16b
	eor	v3.16b, v3.16b, v8.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v8.16b
	eor	v6.16b, v6.16b, v8.16b
	eor	v7.16b, v7.16b, v8.16b
	ret
END(aesarmv8_dec8)