/*	$NetBSD: aes_armv8_64.S,v 1.7 2020/07/25 22:32:09 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

	.arch_extension	aes

/*
 * uint32_t rcon[10]
 *
 *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2).
 *	Such elements of GF(2^8) need only eight bits to be represented,
 *	but we store them in 4-byte units so we can copy one into all
 *	four 4-byte lanes of a vector register with a single LD1R.  The
 *	access pattern is fixed, so indices into this table are never
 *	secret.
 */
	.section .rodata
	.p2align 2
	.type	rcon,@object
rcon:
	.long	0x01
	.long	0x02
	.long	0x04
	.long	0x08
	.long	0x10
	.long	0x20
	.long	0x40
	.long	0x80
	.long	0x1b
	.long	0x36
END(rcon)
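
/*
 * Illustrative sketch, not part of the build: the table above can be
 * regenerated in C by repeated doubling with reduction modulo the AES
 * polynomial.  The function name rcon_gen is hypothetical.
 *
 *	#include <stdint.h>
 *
 *	static void
 *	rcon_gen(uint32_t rcon[10])
 *	{
 *		uint8_t x = 1;
 *		unsigned i;
 *
 *		for (i = 0; i < 10; i++) {
 *			rcon[i] = x;
 *			// multiply by x in GF(2^8): shift, then reduce
 *			x = (x << 1) ^ ((x & 0x80) ? 0x1b : 0);
 *		}
 *	}
 */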

/*
 * uint128_t unshiftrows_rotword_1
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 1, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_1,@object
unshiftrows_rotword_1:
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
END(unshiftrows_rotword_1)

/*
 * uint128_t unshiftrows_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then copy word
 *	3 into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_3,@object
unshiftrows_3:
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
END(unshiftrows_3)

/*
 * uint128_t unshiftrows_rotword_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 3, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_3,@object
unshiftrows_rotword_3:
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
END(unshiftrows_rotword_3)
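
/*
 * Illustrative sketch, not part of the build: with the state laid out
 * column-major as in FIPS 197 (byte i = row i%4, column i/4),
 * ShiftRows sends the byte at row r, column c to column (c - r) mod 4.
 * Since TBL computes dst[i] = src[tbl[i]], the index tables above can
 * be regenerated in C; the function name tbl_gen is hypothetical.
 *
 *	#include <stdint.h>
 *
 *	static void
 *	tbl_gen(uint8_t tbl[16], unsigned word, unsigned rot)
 *	{
 *		unsigned i, r;
 *
 *		for (i = 0; i < 16; i++) {
 *			r = (i + rot) % 4;	// RotWord, if requested
 *			// byte (row r, column `word') after ShiftRows
 *			tbl[i] = 4*((word - r) % 4) + r;
 *		}
 *	}
 *
 * tbl_gen(tbl, 1, 1) yields unshiftrows_rotword_1, tbl_gen(tbl, 3, 0)
 * yields unshiftrows_3, and tbl_gen(tbl, 3, 1) yields
 * unshiftrows_rotword_3.
 */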

/*
 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey128)
	ldr	q1, [x1]	/* q1 := master key */

	adrl	x4, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 table */

	str	q1, [x0], #0x10	/* store master key as first round key */
	mov	x2, #10		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x0 = pointer to round key to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #1	/* count down rounds */
	str	q1, [x0], #0x10	/* store round key */
	b.ne	1b

	ret
END(aesarmv8_setenckey128)
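
/*
 * Illustrative sketch, not part of the build: the same schedule in
 * portable C.  SubWord and RotWord are hypothetical helpers standing
 * in for the aese/tbl sequence above; rcon is the table at the top of
 * this file.
 *
 *	#include <stdint.h>
 *
 *	static void
 *	expand128(uint32_t rk[44], const uint32_t key[4])
 *	{
 *		unsigned i;
 *
 *		for (i = 0; i < 4; i++)
 *			rk[i] = key[i];
 *		for (i = 4; i < 44; i += 4) {
 *			rk[i] = rk[i-4] ^ SubWord(RotWord(rk[i-1]))
 *			    ^ rcon[i/4 - 1];
 *			rk[i+1] = rk[i-3] ^ rk[i];	// running prefix
 *			rk[i+2] = rk[i-2] ^ rk[i+1];	//   XOR, as in the
 *			rk[i+3] = rk[i-1] ^ rk[i+2];	//   v5/v6/v7 EXTs
 *		}
 *	}
 */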

/*
 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey192)
	ldr	q1, [x1], #0x10	/* q1 := master key[0:128) */
	ldr	d2, [x1]	/* d2 := master key[128:192) */

	adrl	x4, unshiftrows_rotword_1
	adrl	x5, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_1 */
	ldr	q17, [x5]	/* q17 := unshiftrows_rotword_3 */

	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
	mov	x2, #12		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
	 * x0 = pointer to three round keys to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *     ^ rklo[1]
	 */

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
	eor	v5.16b, v5.16b, v1.16b
	eor	v5.16b, v5.16b, v3.16b
	eor	v5.16b, v5.16b, v6.16b
	eor	v5.16b, v5.16b, v7.16b

	/*
	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
	 * (rklo[0],rklo[1],...).
	 */

	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	dup	v1.4s, v5.s[3]
	mov	v1.s[0], v5.s[2]

	/*
	 * v6.4s := (0, 0, rklo[0], rklo[1])
	 * v7.4s := (0, 0, 0, rklo[0])
	 */
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v3.16b, v1.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	/*
	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
	 */
	mov	v2.d[1], v5.d[0]

	/* store two round keys */
	stp	q2, q3, [x0], #0x20

	/*
	 * Live vector registers at this point:
	 *
	 *	q0 = zero
	 *	q2 = rk
	 *	q3 = nrk
	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
	 *	q16 = unshiftrows_rotword_1
	 *	q17 = unshiftrows_rotword_3
	 *
	 * We have to compute, in q1:
	 *
	 *	nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
	 *	nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
	 *	nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *	nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *	    ^ nrk[1]
	 *
	 * And, if there's any more afterward, in q2:
	 *
	 *	nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *	    ^ nrk[1] ^ nrk[2]
	 *	nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *	    ^ nrk[1] ^ nrk[2] ^ nrk[3]
	 */

	/* q1 := ShiftRows(SubBytes(q3)) */
	mov	v1.16b, v3.16b
	aese	v1.16b, v0.16b

	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
	ld1r	{v4.4s}, [x3], #4
	tbl	v1.16b, {v1.16b}, v17.16b
	eor	v1.16b, v1.16b, v4.16b

	/*
	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1])	[already]
	 * v4.4s := (0, rk[2], rk[3], nrk[0])
	 * v6.4s := (0, 0, rk[2], rk[3])
	 * v7.4s := (0, 0, 0, rk[2])
	 */
	ext	v4.16b, v0.16b, v5.16b, #12
	ext	v6.16b, v0.16b, v5.16b, #8
	ext	v7.16b, v0.16b, v5.16b, #4

	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #3	/* count down three rounds */
	str	q1, [x0], #0x10	/* store third round key */
	b.eq	2f

	/*
	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
	 * v5.4s := (0, nrk[2], xxx, xxx)
	 */
	ext	v4.16b, v3.16b, v0.16b, #8
	ext	v5.16b, v0.16b, v4.16b, #12

	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
	dup	v2.4s, v1.s[3]

	/*
	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
	 *     xxx, xxx)
	 */
	eor	v2.16b, v2.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	b	1b

2:	ret
END(aesarmv8_setenckey192)

/*
 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey256)
	/* q1 := key[0:128), q2 := key[128:256) */
	ldp	q1, q2, [x1], #0x20

	adrl	x4, unshiftrows_rotword_3
	adrl	x5, unshiftrows_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 */
	ldr	q17, [x5]	/* q17 := unshiftrows_3 */

	/* store master key as first two round keys */
	stp	q1, q2, [x0], #0x20
	mov	x2, #14		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
	 * v6.4s := (0,0,pprk[0],pprk[1])
	 * v7.4s := (0,0,0,pprk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #2	/* count down two rounds */
	b.eq	2f	/* stop if this is the last one */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := SubBytes(rk[3]) */
	tbl	v3.16b, {v3.16b}, v17.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v2.16b, #12
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v2.16b, v2.16b, v3.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v2.16b, v2.16b, v7.16b

	stp	q1, q2, [x0], #0x20	/* store two round keys */
	b	1b

2:	str	q1, [x0]	/* store last round key */
	ret
END(aesarmv8_setenckey256)
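
/*
 * Illustrative sketch, not part of the build: the AES-256 schedule in
 * C, showing why the second aese/tbl pair above uses unshiftrows_3
 * (SubWord with no rotation).  expand256, SubWord, and RotWord are
 * hypothetical names.
 *
 *	#include <stdint.h>
 *
 *	static void
 *	expand256(uint32_t rk[60], const uint32_t key[8])
 *	{
 *		unsigned i;
 *
 *		for (i = 0; i < 8; i++)
 *			rk[i] = key[i];
 *		for (i = 8; i < 60; i++) {
 *			uint32_t t = rk[i-1];
 *
 *			if (i % 8 == 0)
 *				t = SubWord(RotWord(t)) ^ rcon[i/8 - 1];
 *			else if (i % 8 == 4)
 *				t = SubWord(t);	// no RotWord here
 *			rk[i] = rk[i-8] ^ t;
 *		}
 *	}
 */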

/*
 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
 *     uint32_t nrounds@x2)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enctodec)
	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
	b	2f
1:	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
2:	str	q0, [x1], #0x10	/* store round key */
	subs	x2, x2, #1	/* count down round */
	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
	b.ne	1b	/* repeat if there's more */
	str	q0, [x1]	/* store first round key verbatim */
	ret
END(aesarmv8_enctodec)
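
/*
 * Illustrative sketch, not part of the build: the same conversion with
 * ARM ACLE intrinsics.  The round keys are reversed, and only the
 * middle keys pass through InvMixColumns; enctodec and the flat
 * round-key arrays are hypothetical.
 *
 *	#include <arm_neon.h>
 *
 *	static void
 *	enctodec(const uint8_t *enc, uint8_t *dec, unsigned nrounds)
 *	{
 *		unsigned i;
 *
 *		for (i = 0; i <= nrounds; i++) {
 *			uint8x16_t rk = vld1q_u8(enc + 16*(nrounds - i));
 *
 *			if (i != 0 && i != nrounds)	// middle keys only
 *				rk = vaesimcq_u8(rk);
 *			vst1q_u8(dec + 16*i, rk);
 *		}
 *	}
 */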

/*
 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enc)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]	/* q0 := ptxt */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	str	q0, [x2]	/* store ctxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_enc)

/*
 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_dec)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]	/* q0 := ctxt */
	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
	str	q0, [x2]	/* store ptxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_dec)

/*
 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_enc)
	cbz	x3, 2f	/* stop if nothing to do */
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0	/* x9 := enckey */
	mov	x10, x3	/* x10 := nbytes */
	ldr	q0, [x4]	/* q0 := chaining value */
1:	ldr	q1, [x1], #0x10	/* q1 := plaintext block */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
	mov	x0, x9	/* x0 := enckey */
	mov	x3, x5	/* x3 := nrounds */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10	/* count down nbytes */
	str	q0, [x2], #0x10	/* store ciphertext block */
	b.ne	1b	/* repeat if x10 is nonzero */
	str	q0, [x4]	/* store chaining value */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
2:	ret
END(aesarmv8_cbc_enc)
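
/*
 * Note: CBC encryption is the serial recurrence
 *
 *	cv[-1] = iv,  ctxt[i] = AES_k(ptxt[i] ^ ctxt[i-1]),
 *
 * so there is no 8-block variant of it; each block depends on the
 * previous ciphertext block.
 */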

/*
 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q24, [x4]	/* q24 := iv */
	mov	x9, x0	/* x9 := deckey */
	mov	x10, x3	/* x10 := nbytes */
	add	x1, x1, x3	/* x1 := pointer past end of in */
	add	x2, x2, x3	/* x2 := pointer past end of out */
	ldr	q0, [x1, #-0x10]!	/* q0 := last ciphertext block */
	str	q0, [x4]	/* update iv */
	b	2f
1:	ldr	q31, [x1, #-0x10]!	/* q31 := chaining value */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
	str	q0, [x2, #-0x10]!	/* store plaintext block */
	mov	v0.16b, v31.16b	/* move cv = ciphertext block */
2:	mov	x0, x9	/* x0 := deckey */
	mov	x3, x5	/* x3 := nrounds */
	bl	aesarmv8_dec1	/* q0 := cv ^ ptxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b	/* repeat if more blocks */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
	str	q0, [x2, #-0x10]!	/* store first plaintext block */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec1)
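
/*
 * Note: CBC decryption is parallelizable and runs back to front,
 *
 *	ptxt[i] = AES^-1_k(ctxt[i]) ^ ctxt[i-1],  with ctxt[-1] = iv,
 *
 * so each block's chaining value is still in the input buffer when
 * needed; the iv saved in q24 supplies ctxt[-1] for the first block
 * at the end.
 */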

/*
 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q24, [x4]	/* q24 := iv */
	mov	x9, x0	/* x9 := deckey */
	mov	x10, x3	/* x10 := nbytes */
	add	x1, x1, x3	/* x1 := pointer past end of in */
	add	x2, x2, x3	/* x2 := pointer past end of out */
	ldp	q6, q7, [x1, #-0x20]!	/* q6, q7 := last ciphertext blocks */
	str	q7, [x4]	/* update iv */
	b	2f
1:	ldp	q6, q7, [x1, #-0x20]!
	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!
2:	ldp	q4, q5, [x1, #-0x20]!
	ldp	q2, q3, [x1, #-0x20]!
	ldp	q0, q1, [x1, #-0x20]!
	mov	v31.16b, v6.16b	/* q[24+i] := cv[i], 0<i<8 */
	mov	v30.16b, v5.16b
	mov	v29.16b, v4.16b
	mov	v28.16b, v3.16b
	mov	v27.16b, v2.16b
	mov	v26.16b, v1.16b
	mov	v25.16b, v0.16b
	mov	x0, x9	/* x0 := deckey */
	mov	x3, x5	/* x3 := nrounds */
	bl	aesarmv8_dec8	/* q[i] := cv[i] ^ pt[i];
				 * trash x0/x3/q16 */
	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
	eor	v6.16b, v6.16b, v30.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v1.16b, v1.16b, v25.16b
	subs	x10, x10, #0x80	/* count down nbytes */
	stp	q6, q7, [x2, #-0x20]!	/* store plaintext blocks */
	stp	q4, q5, [x2, #-0x20]!
	stp	q2, q3, [x2, #-0x20]!
	b.ne	1b	/* repeat if there's more */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!	/* store first two plaintext blocks */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec8)

/*
 * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0	/* x9 := enckey */
	mov	x10, x3	/* x10 := nbytes */
	ldr	q31, [x4]	/* q31 := tweak */
1:	ldr	q0, [x1], #0x10	/* q0 := ptxt */
	mov	x0, x9	/* x0 := enckey */
	mov	x3, x5	/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
	bl	aesarmv8_enc1	/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
	str	q0, [x2], #0x10	/* store ciphertext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b	/* repeat if more blocks */
	str	q31, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc1)
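
/*
 * Note: XTS whitens each block with the tweak on both sides of the
 * cipher,
 *
 *	ctxt[i] = AES_k(ptxt[i] ^ t[i]) ^ t[i],  t[i+1] = t[i] * x,
 *
 * where the multiplication is in GF(2^128) modulo
 * x^128 + x^7 + x^2 + x + 1 (see aesarmv8_xts_mulx below).
 */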

/*
 * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0	/* x9 := enckey */
	mov	x10, x3	/* x10 := nbytes */
	ldr	q31, [x4]	/* q31 := tweak */
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	/* q31 := tweak[7] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := ptxt[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9	/* x0 := enckey */
	mov	x3, x5	/* x3 := nrounds */
	bl	aesarmv8_enc8	/* encrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	stp	q0, q1, [x2], #0x20	/* store ciphertext blocks */
	stp	q2, q3, [x2], #0x20
	stp	q4, q5, [x2], #0x20
	stp	q6, q7, [x2], #0x20
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80	/* count down nbytes */
	b.ne	1b	/* repeat if more block groups */
	str	q31, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc8)

/*
 * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0	/* x9 := deckey */
	mov	x10, x3	/* x10 := nbytes */
	ldr	q31, [x4]	/* q31 := tweak */
1:	ldr	q0, [x1], #0x10	/* q0 := ctxt */
	mov	x0, x9	/* x0 := deckey */
	mov	x3, x5	/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
	bl	aesarmv8_dec1	/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
	str	q0, [x2], #0x10	/* store plaintext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b	/* repeat if more blocks */
	str	q31, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec1)

/*
 * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0	/* x9 := deckey */
	mov	x10, x3	/* x10 := nbytes */
	ldr	q31, [x4]	/* q31 := tweak */
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	/* q31 := tweak[7] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := ctxt[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9	/* x0 := deckey */
	mov	x3, x5	/* x3 := nrounds */
	bl	aesarmv8_dec8	/* decrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	stp	q0, q1, [x2], #0x20	/* store plaintext blocks */
	stp	q2, q3, [x2], #0x20
	stp	q4, q5, [x2], #0x20
	stp	q6, q7, [x2], #0x20
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80	/* count down nbytes */
	b.ne	1b	/* repeat if more block groups */
	str	q31, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec8)

/*
 * aesarmv8_xts_mulx(tweak@q31)
 *
 *	Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses x0 and q0/q1 as temporaries.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_xts_mulx,@function
aesarmv8_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low half must be
	 *     shifted into the low bit of the high half, and
	 * (b) whether the high bit of the high half must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	adrl	x0, xtscarry
	cmlt	v1.2d, v31.2d, #0	/* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
	ldr	q0, [x0]	/* q0 := xtscarry */
	ext	v1.16b, v1.16b, v1.16b, #8	/* swap halves of q1 */
	shl	v31.2d, v31.2d, #1	/* shift */
	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
	eor	v31.16b, v31.16b, v0.16b	/* incorporate (a) and (b) */
	ret
END(aesarmv8_xts_mulx)

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0
END(xtscarry)
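
/*
 * Illustrative sketch, not part of the build: the same multiplication
 * on two 64-bit halves in C (the function name xts_mulx is
 * hypothetical):
 *
 *	#include <stdint.h>
 *
 *	static void
 *	xts_mulx(uint64_t t[2])
 *	{
 *		uint64_t carry = t[1] >> 63;	// bit shifted out of x^127
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 *
 * The vector version gets both conditional XORs at once: CMLT turns
 * each half's sign bit into an all-ones mask, EXT swaps the two
 * masks, and AND with xtscarry = (0x87, 1) selects the reduction byte
 * for the low half and the carry bit for the high half.
 */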

/*
 * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_update)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q31, [x0]	/* load tweak */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	str	q31, [x1]	/* store tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_update)

/*
 * aesarmv8_enc1(const struct aesenc *enckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Encrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc1,@function
aesarmv8_enc1:
	ldr	q16, [x0], #0x10	/* load round key */
	b	2f
1:	/* q0 := MixColumns(q0) */
	aesmc	v0.16b, v0.16b
2:	subs	x3, x3, #1
	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
	aese	v0.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.ne	1b
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_enc1)
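
/*
 * Illustrative sketch, not part of the build: the same loop with ARM
 * ACLE intrinsics, showing the round pairing.  AESE folds AddRoundKey,
 * SubBytes, and ShiftRows into one instruction, so the last round
 * skips AESMC and the final AddRoundKey is a plain XOR.  The function
 * name enc1 and the flat round-key array are hypothetical.
 *
 *	#include <arm_neon.h>
 *
 *	static uint8x16_t
 *	enc1(const uint8_t *rk, uint8x16_t b, unsigned nrounds)
 *	{
 *		unsigned i;
 *
 *		for (i = 0; i < nrounds; i++) {
 *			b = vaeseq_u8(b, vld1q_u8(rk + 16*i));
 *			if (i + 1 < nrounds)	// all but the last round
 *				b = vaesmcq_u8(b);
 *		}
 *		return veorq_u8(b, vld1q_u8(rk + 16*nrounds));
 *	}
 */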

/*
 * aesarmv8_enc8(const struct aesenc *enckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Encrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc8,@function
aesarmv8_enc8:
	ldr	q16, [x0], #0x10	/* load round key */
	b	2f
1:	/* q[i] := MixColumns(q[i]) */
	aesmc	v0.16b, v0.16b
	aesmc	v1.16b, v1.16b
	aesmc	v2.16b, v2.16b
	aesmc	v3.16b, v3.16b
	aesmc	v4.16b, v4.16b
	aesmc	v5.16b, v5.16b
	aesmc	v6.16b, v6.16b
	aesmc	v7.16b, v7.16b
2:	subs	x3, x3, #1
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	aese	v2.16b, v16.16b
	aese	v3.16b, v16.16b
	aese	v4.16b, v16.16b
	aese	v5.16b, v16.16b
	aese	v6.16b, v16.16b
	aese	v7.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.ne	1b
	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_enc8)

/*
 * aesarmv8_dec1(const struct aesdec *deckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Decrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec1,@function
aesarmv8_dec1:
	ldr	q16, [x0], #0x10	/* load round key */
	b	2f
1:	/* q0 := InMixColumns(q0) */
	aesimc	v0.16b, v0.16b
2:	subs	x3, x3, #1
	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
	aesd	v0.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.ne	1b
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_dec1)

/*
 * aesarmv8_dec8(const struct aesdec *deckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Decrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec8,@function
aesarmv8_dec8:
	ldr	q16, [x0], #0x10	/* load round key */
	b	2f
1:	/* q[i] := InMixColumns(q[i]) */
	aesimc	v0.16b, v0.16b
	aesimc	v1.16b, v1.16b
	aesimc	v2.16b, v2.16b
	aesimc	v3.16b, v3.16b
	aesimc	v4.16b, v4.16b
	aesimc	v5.16b, v5.16b
	aesimc	v6.16b, v6.16b
	aesimc	v7.16b, v7.16b
2:	subs	x3, x3, #1
	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
	aesd	v0.16b, v16.16b
	aesd	v1.16b, v16.16b
	aesd	v2.16b, v16.16b
	aesd	v3.16b, v16.16b
	aesd	v4.16b, v16.16b
	aesd	v5.16b, v16.16b
	aesd	v6.16b, v16.16b
	aesd	v7.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.ne	1b
	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_dec8)