/*	$NetBSD: aes_armv8_64.S,v 1.14 2020/09/08 23:57:43 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

RCSID("$NetBSD: aes_armv8_64.S,v 1.14 2020/09/08 23:57:43 riastradh Exp $")

	.arch_extension	aes

/*
 * uint32_t rcon[10]
 *
 *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2).
 *	Such elements of GF(2^8) need only eight bits to be represented,
 *	but we store them in 4-byte units so we can copy one into all
 *	four 4-byte lanes of a vector register with a single LD1R.  The
 *	access pattern is fixed, so indices into this table are never
 *	secret.
 */
	.section .rodata
	.p2align 2
	.type	rcon,@object
rcon:
	.long	0x01
	.long	0x02
	.long	0x04
	.long	0x08
	.long	0x10
	.long	0x20
	.long	0x40
	.long	0x80
	.long	0x1b
	.long	0x36
END(rcon)
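
/*
 * For reference, the wrap-around entries follow from reducing powers of
 * x modulo the AES polynomial:
 *
 *	x^8 = x^4 + x^3 + x + 1			= 0x1b
 *	x^9 = x*x^8 = x^5 + x^4 + x^2 + x	= 0x36
 *
 * so doubling 0x80 yields 0x1b and doubling 0x1b yields 0x36.
 */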

/*
 * uint128_t unshiftrows_rotword_1
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 1, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_1,@object
unshiftrows_rotword_1:
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
END(unshiftrows_rotword_1)

/*
 * uint128_t unshiftrows_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then copy word
 *	3 into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_3,@object
unshiftrows_3:
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
END(unshiftrows_3)

/*
 * uint128_t unshiftrows_rotword_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 3, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_3,@object
unshiftrows_rotword_3:
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
END(unshiftrows_rotword_3)
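
/*
 * How the indices above are derived: AESE applies ShiftRows after
 * SubBytes, moving the byte at (row r, column c) to column (c - r) mod 4.
 * Word 3 of the original state (bytes 12-15) therefore ends up at byte
 * positions 12, 9, 6, 3; picking them in the order 9, 6, 3, 12 also
 * rotates the word left by one byte (RotWord), giving
 * unshiftrows_rotword_3, while the order 12, 9, 6, 3 gives
 * unshiftrows_3.  Word 1 (bytes 4-7) ends up at positions 4, 1, 14, 11,
 * so the rotated selection is 1, 14, 11, 4 (unshiftrows_rotword_1).
 */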

/*
 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	Standard ABI calling convention.
 */
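
/*
 * As a cross-check, a rough C sketch of the same schedule (FIPS-197,
 * Nk = 4); SubWord() and RotWord() are hypothetical helpers standing in
 * for the AESE/TBL combination used below, and rcon[] is the table
 * above:
 *
 *	uint32_t w[4*(10 + 1)];
 *	memcpy(w, key, 16);			// w[0..3] = master key
 *	for (unsigned i = 4; i < 4*(10 + 1); i++) {
 *		uint32_t t = w[i - 1];
 *		if (i % 4 == 0)
 *			t = SubWord(RotWord(t)) ^ rcon[i/4 - 1];
 *		w[i] = w[i - 4] ^ t;		// one new round key word
 *	}
 *
 * The loop below computes four of these words at once: the EXT/EOR
 * chain folds the running prefix of previous-round words into all four
 * lanes in parallel.
 */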
ENTRY(aesarmv8_setenckey128)
	ld1	{v1.16b}, [x1]	/* q1 := master key */

	adrl	x4, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 table */

	str	q1, [x0], #0x10	/* store master key as first round key */
	mov	x2, #10		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x0 = pointer to round key to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #1	/* count down rounds */
	str	q1, [x0], #0x10	/* store round key */
	b.ne	1b

	ret
END(aesarmv8_setenckey128)

/*
 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	Standard ABI calling convention.
 */
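
/*
 * AES-192 needs 4*(12 + 1) = 52 32-bit words of round key material, and
 * the schedule produces it six words at a time.  Each pass through the
 * loop below therefore emits twelve fresh words (three 128-bit round
 * keys) using two round constants, and runs four times on top of the
 * four words of master key stored above the loop.
 */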
ENTRY(aesarmv8_setenckey192)
	ld1	{v1.16b}, [x1], #0x10	/* q1 := master key[0:128) */
	ld1	{v2.8b}, [x1]	/* d2 := master key[128:192) */

	adrl	x4, unshiftrows_rotword_1
	adrl	x5, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_1 */
	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_rotword_3 */

	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
	mov	x2, #12		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
	 * x0 = pointer to three round keys to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *	    ^ rklo[1]
	 */

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
	eor	v5.16b, v5.16b, v1.16b
	eor	v5.16b, v5.16b, v3.16b
	eor	v5.16b, v5.16b, v6.16b
	eor	v5.16b, v5.16b, v7.16b

	/*
	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
	 * (rklo[0],rklo[1],...).
	 */

	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	dup	v1.4s, v5.s[3]
	mov	v1.s[0], v5.s[2]

	/*
	 * v6.4s := (0, 0, rklo[0], rklo[1])
	 * v7.4s := (0, 0, 0, rklo[0])
	 */
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v3.16b, v1.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	/*
	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
	 */
	mov	v2.d[1], v5.d[0]

	/* store two round keys */
	stp	q2, q3, [x0], #0x20

	/*
	 * Live vector registers at this point:
	 *
	 * q0 = zero
	 * q2 = rk
	 * q3 = nrk
	 * v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
	 * q16 = unshiftrows_rotword_1
	 * q17 = unshiftrows_rotword_3
	 *
	 * We have to compute, in q1:
	 *
	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *	    ^ nrk[1]
	 *
	 * And, if there's any more afterward, in q2:
	 *
	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *	    ^ nrk[1] ^ nrk[2]
	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *	    ^ nrk[1] ^ nrk[2] ^ nrk[3]
	 */

	/* q1 := ShiftRows(SubBytes(q3)) */
	mov	v1.16b, v3.16b
	aese	v1.16b, v0.16b

	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
	ld1r	{v4.4s}, [x3], #4
	tbl	v1.16b, {v1.16b}, v17.16b
	eor	v1.16b, v1.16b, v4.16b

	/*
	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1])	[already]
	 * v4.4s := (0, rk[2], rk[3], nrk[0])
	 * v6.4s := (0, 0, rk[2], rk[3])
	 * v7.4s := (0, 0, 0, rk[2])
	 */
	ext	v4.16b, v0.16b, v5.16b, #12
	ext	v6.16b, v0.16b, v5.16b, #8
	ext	v7.16b, v0.16b, v5.16b, #4

	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #3	/* count down three rounds */
	str	q1, [x0], #0x10	/* store third round key */
	b.eq	2f

	/*
	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
	 * v5.4s := (0, nrk[2], xxx, xxx)
	 */
	ext	v4.16b, v3.16b, v0.16b, #8
	ext	v5.16b, v0.16b, v4.16b, #12

	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
	dup	v2.4s, v1.s[3]

	/*
	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
	 *	     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
	 *	     xxx, xxx)
	 */
	eor	v2.16b, v2.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	b	1b

2:	ret
END(aesarmv8_setenckey192)

/*
 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Standard ABI calling convention.
 */
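
/*
 * AES-256 needs 4*(14 + 1) = 60 words of round key material, produced
 * eight words (two round keys) per schedule step: the first four use
 * RotWord/SubWord and a round constant, the second four use SubWord
 * alone.  Each pass through the loop below emits one such pair, and the
 * final pass stops halfway since only the first key of the last pair is
 * needed.
 */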
ENTRY(aesarmv8_setenckey256)
	/* q1 := key[0:128), q2 := key[128:256) */
	ld1	{v1.16b-v2.16b}, [x1], #0x20

	adrl	x4, unshiftrows_rotword_3
	adrl	x5, unshiftrows_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 */
	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_3 */

	/* store master key as first two round keys */
	stp	q1, q2, [x0], #0x20
	mov	x2, #14		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
	 * v6.4s := (0,0,pprk[0],pprk[1])
	 * v7.4s := (0,0,0,pprk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #2	/* count down two rounds */
	b.eq	2f		/* stop if this is the last one */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := SubBytes(rk[3]) */
	tbl	v3.16b, {v3.16b}, v17.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v2.16b, #12
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v2.16b, v2.16b, v3.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v2.16b, v2.16b, v7.16b

	stp	q1, q2, [x0], #0x20	/* store two round keys */
	b	1b

2:	str	q1, [x0]	/* store last round key */
	ret
END(aesarmv8_setenckey256)

/*
 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
 *     uint32_t nrounds@x2)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
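
/*
 * In C terms, with InvMixColumns() standing in for AESIMC and rk/drk
 * denoting the 4*(nrounds+1)-word encryption and decryption schedules:
 *
 *	for (unsigned i = 0; i <= nrounds; i++) {
 *		memcpy(drk + 4*i, rk + 4*(nrounds - i), 16);
 *		if (i != 0 && i != nrounds)
 *			InvMixColumns(drk + 4*i);
 *	}
 *
 * i.e., the decryption schedule is the encryption schedule reversed,
 * with InvMixColumns applied to every round key except the two ends.
 */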
ENTRY(aesarmv8_enctodec)
	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
	b	2f
	_ALIGN_TEXT
1:	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
2:	str	q0, [x1], #0x10	/* store round key */
	subs	x2, x2, #1	/* count down round */
	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
	b.ne	1b		/* repeat if there's more */
	str	q0, [x1]	/* store first round key verbatim */
	ret
END(aesarmv8_enctodec)

/*
 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enc)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x1]	/* q0 := ptxt */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	st1	{v0.16b}, [x2]	/* store ctxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_enc)

/*
 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_dec)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x1]	/* q0 := ctxt */
	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
	st1	{v0.16b}, [x2]	/* store ptxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_dec)

/*
 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
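
/*
 * In C terms, with aes_enc() and xor16() as hypothetical stand-ins for
 * the aesarmv8_enc1 call and the vector EOR:
 *
 *	uint8_t cv[16];
 *	memcpy(cv, iv, 16);
 *	for (size_t i = 0; i < nbytes; i += 16) {
 *		xor16(cv, cv, in + i);			// cv ^= ptxt block
 *		aes_enc(enckey, cv, cv, nrounds);	// cv = E_k(cv)
 *		memcpy(out + i, cv, 16);
 *	}
 *	memcpy(iv, cv, 16);				// hand back final cv
 */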
ENTRY(aesarmv8_cbc_enc)
	cbz	x3, 2f		/* stop if nothing to do */
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
	ld1	{v0.16b}, [x4]	/* q0 := chaining value */
	_ALIGN_TEXT
1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10	/* count down nbytes */
	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
	b.ne	1b		/* repeat if x10 is nonzero */
	st1	{v0.16b}, [x4]	/* store chaining value */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
2:	ret
END(aesarmv8_cbc_enc)

/*
 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
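
/*
 * Each block needs only its own ciphertext and the preceding ciphertext
 * block, so the loop below runs from the last block to the first and
 * only the old IV and the new IV have to be held aside.  In C terms
 * (aes_dec()/xor16() hypothetical):
 *
 *	uint8_t cv0[16], tmp[16];
 *	memcpy(cv0, iv, 16);			// IV for the first block
 *	memcpy(iv, in + nbytes - 16, 16);	// next IV = last ct block
 *	for (size_t i = nbytes - 16; i >= 16; i -= 16) {
 *		aes_dec(deckey, in + i, tmp, nrounds);
 *		xor16(out + i, tmp, in + i - 16);
 *	}
 *	aes_dec(deckey, in, tmp, nrounds);
 *	xor16(out, tmp, cv0);
 */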
ENTRY(aesarmv8_cbc_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v24.16b}, [x4]	/* q24 := iv */
	mov	x9, x0		/* x9 := deckey */
	mov	x10, x3		/* x10 := nbytes */
	add	x1, x1, x3	/* x1 := pointer past end of in */
	add	x2, x2, x3	/* x2 := pointer past end of out */
	sub	x1, x1, #0x10
	ld1	{v0.16b}, [x1]	/* q0 := last ciphertext block */
	st1	{v0.16b}, [x4]	/* update iv */
	b	2f
	_ALIGN_TEXT
1:	sub	x1, x1, #0x10
	ld1	{v31.16b}, [x1]	/* q31 := chaining value */
	sub	x2, x2, #0x10
	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
	st1	{v0.16b}, [x2]	/* store plaintext block */
	mov	v0.16b, v31.16b	/* move cv = ciphertext block */
2:	mov	x0, x9		/* x0 := deckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_dec1	/* q0 := cv ^ ptxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b		/* repeat if more blocks */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
	sub	x2, x2, #0x10	/* store first plaintext block */
	st1	{v0.16b}, [x2]
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec1)

/*
 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v24.16b}, [x4]	/* q24 := iv */
	mov	x9, x0		/* x9 := deckey */
	mov	x10, x3		/* x10 := nbytes */
	add	x1, x1, x3	/* x1 := pointer past end of in */
	add	x2, x2, x3	/* x2 := pointer past end of out */
	sub	x1, x1, #0x20
	ld1	{v6.16b, v7.16b}, [x1]	/* q6, q7 := last ciphertext blocks */
	st1	{v7.16b}, [x4]	/* update iv */
	b	2f
	_ALIGN_TEXT
1:	sub	x1, x1, #0x20
	ld1	{v6.16b, v7.16b}, [x1]
	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
	sub	x2, x2, #0x20
	st1	{v0.16b, v1.16b}, [x2]
2:	sub	x1, x1, #0x20
	ld1	{v4.16b-v5.16b}, [x1]
	sub	x1, x1, #0x40
	ld1	{v0.16b-v3.16b}, [x1]

	mov	v31.16b, v6.16b	/* q[24+i] := cv[i], 0<i<8 */
	mov	v30.16b, v5.16b
	mov	v29.16b, v4.16b
	mov	v28.16b, v3.16b
	mov	v27.16b, v2.16b
	mov	v26.16b, v1.16b
	mov	v25.16b, v0.16b
	mov	x0, x9		/* x0 := deckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_dec8	/* q[i] := cv[i] ^ pt[i];
				 * trash x0/x3/q16 */
	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
	eor	v6.16b, v6.16b, v30.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v1.16b, v1.16b, v25.16b
	subs	x10, x10, #0x80	/* count down nbytes */
	sub	x2, x2, #0x20	/* store plaintext blocks */
	st1	{v6.16b-v7.16b}, [x2]
	sub	x2, x2, #0x40
	st1	{v2.16b-v5.16b}, [x2]
	b.ne	1b		/* repeat if there's more */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
	sub	x2, x2, #0x20
	st1	{v0.16b, v1.16b}, [x2]	/* store first two plaintext blocks */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec8)

/*
 * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
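
/*
 * In C terms, one XTS step per 16-byte block (aes_enc(), xor16(), and
 * xts_mulx() are hypothetical stand-ins for the subroutines used
 * below):
 *
 *	uint8_t buf[16];
 *	for (size_t i = 0; i < nbytes; i += 16) {
 *		xor16(buf, in + i, tweak);		// p ^ t
 *		aes_enc(enckey, buf, buf, nrounds);	// E_k(p ^ t)
 *		xor16(out + i, buf, tweak);		// E_k(p ^ t) ^ t
 *		xts_mulx(tweak);			// t = t*x in GF(2^128)
 *	}
 */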
ENTRY(aesarmv8_xts_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
	ld1	{v31.16b}, [x4]	/* q31 := tweak */
	_ALIGN_TEXT
1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ptxt */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
	bl	aesarmv8_enc1	/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b		/* repeat if more blocks */
	st1	{v31.16b}, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc1)

/*
 * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
	ld1	{v31.16b}, [x4]	/* q31 := tweak */
	_ALIGN_TEXT
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	/* q31 := tweak[7] */
	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ptxt[i] */
	ld1	{v4.16b-v7.16b}, [x1], #0x40
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_enc8	/* encrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store ciphertext blocks */
	st1	{v4.16b-v7.16b}, [x2], #0x40
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80	/* count down nbytes */
	b.ne	1b		/* repeat if more block groups */
	st1	{v31.16b}, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc8)

/*
 * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := deckey */
	mov	x10, x3		/* x10 := nbytes */
	ld1	{v31.16b}, [x4]	/* q31 := tweak */
	_ALIGN_TEXT
1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ctxt */
	mov	x0, x9		/* x0 := deckey */
	mov	x3, x5		/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
	bl	aesarmv8_dec1	/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
	st1	{v0.16b}, [x2], #0x10	/* store plaintext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b		/* repeat if more blocks */
	st1	{v31.16b}, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec1)

/*
 * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := deckey */
	mov	x10, x3		/* x10 := nbytes */
	ld1	{v31.16b}, [x4]	/* q31 := tweak */
	_ALIGN_TEXT
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	/* q31 := tweak[7] */
	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ctxt[i] */
	ld1	{v4.16b-v7.16b}, [x1], #0x40
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9		/* x0 := deckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_dec8	/* decrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store plaintext blocks */
	st1	{v4.16b-v7.16b}, [x2], #0x40
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80	/* count down nbytes */
	b.ne	1b		/* repeat if more block groups */
	st1	{v31.16b}, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec8)

/*
 * aesarmv8_xts_mulx(tweak@q31)
 *
 *	Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses x0 and q0/q1 as temporaries.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_xts_mulx,@function
aesarmv8_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low half must be
	 *     shifted into the low bit of the high half, and
	 * (b) whether the high bit of the high half must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	adrl	x0, xtscarry
	cmlt	v1.2d, v31.2d, #0	/* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
	ld1	{v0.16b}, [x0]	/* q0 := xtscarry */
	ext	v1.16b, v1.16b, v1.16b, #8	/* swap halves of q1 */
	shl	v31.2d, v31.2d, #1	/* shift */
	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
	eor	v31.16b, v31.16b, v0.16b	/* incorporate (a) and (b) */
	ret
END(aesarmv8_xts_mulx)

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0
END(xtscarry)
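
/*
 * Byte-wise, with t[0] the least significant byte of the tweak, the
 * multiplication above is
 *
 *	carry = t[15] >> 7;			// bit shifted out of x^127
 *	for (i = 15; i > 0; i--)
 *		t[i] = (t[i] << 1) | (t[i-1] >> 7);
 *	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);	// x^128 = x^7+x^2+x+1
 *
 * The vector version shifts both 64-bit halves at once and uses the
 * swapped CMLT mask to feed bit 63 of the low half into bit 0 of the
 * high half (the 1 in xtscarry) and bit 127 into the 0x87 reduction.
 */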

/*
 * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_update)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v31.16b}, [x0]	/* load tweak */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	st1	{v31.16b}, [x1]	/* store tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_update)

/*
 * aesarmv8_cbcmac_update1(const struct aesenc *enckey@x0,
 *     const uint8_t *in@x1, size_t nbytes@x2, uint8_t auth[16] @x3,
 *     uint32_t nrounds@x4)
 *
 *	Update CBC-MAC.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
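
/*
 * In C terms (aes_enc()/xor16() hypothetical):
 *
 *	for (size_t i = 0; i < nbytes; i += 16) {
 *		xor16(auth, auth, in + i);
 *		aes_enc(enckey, auth, auth, nrounds);
 *	}
 */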
ENTRY(aesarmv8_cbcmac_update1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x3]	/* q0 := initial authenticator */
	mov	x9, x0		/* x9 := enckey */
	mov	x5, x3		/* x5 := &auth (enc1 trashes x3) */
	_ALIGN_TEXT
1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x4		/* x3 := nrounds */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc1	/* q0 := auth'; trash x0/x3/q16 */
	subs	x2, x2, #0x10	/* count down nbytes */
	b.ne	1b		/* repeat if x2 is nonzero */
	st1	{v0.16b}, [x5]	/* store updated authenticator */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbcmac_update1)

/*
 * aesarmv8_ccm_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Update CCM encryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
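
/*
 * authctr[0:16) holds the CBC-MAC state and authctr[16:32) the CTR
 * block with a 32-bit big-endian counter in its last four bytes.  In C
 * terms (hypothetical helpers; ctr_inc32() increments that counter):
 *
 *	for (size_t i = 0; i < nbytes; i += 16) {
 *		xor16(auth, auth, in + i);		// CBC-MAC over ptxt
 *		aes_enc(enckey, auth, auth, nrounds);
 *		ctr_inc32(ctr);
 *		aes_enc(enckey, ctr, pad, nrounds);
 *		xor16(out + i, in + i, pad);		// CTR encryption
 *	}
 *
 * except that the two block encryptions are issued together through
 * aesarmv8_enc2.
 */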
ENTRY(aesarmv8_ccm_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b, v1.16b}, [x4]	/* q0 := auth, q1 := ctr (be) */
	mov	v2.16b, v1.16b	/* q2 := ctr (be) */
	adrl	x11, ctr32_inc	/* x11 := &ctr32_inc */
	ld1	{v5.4s}, [x11]	/* q5 := (0,0,0,1) (host-endian) */
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
	rev32	v2.16b, v2.16b	/* q2 := ctr (host-endian) */
	_ALIGN_TEXT
1:	ld1	{v3.16b}, [x1], #0x10	/* q3 := plaintext block */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	rev32	v1.16b, v2.16b	/* q1 := ctr (big-endian) */
	eor	v0.16b, v0.16b, v3.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc2	/* q0 := auth', q1 := pad;
				 * trash x0/x3/q16 */
	eor	v3.16b, v1.16b, v3.16b	/* q3 := ciphertext block */
	subs	x10, x10, #0x10	/* count down bytes */
	st1	{v3.16b}, [x2], #0x10	/* store ciphertext block */
	b.ne	1b		/* repeat if more blocks */
	rev32	v2.16b, v2.16b	/* q2 := ctr (big-endian) */
	mov	v1.16b, v2.16b	/* store updated auth/ctr */
	st1	{v0.16b-v1.16b}, [x4]
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_ccm_enc1)

/*
 * aesarmv8_ccm_dec1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Update CCM decryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_ccm_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v1.16b, v2.16b}, [x4]	/* q1 := auth, q2 := ctr (be) */
	adrl	x11, ctr32_inc	/* x11 := &ctr32_inc */
	ld1	{v5.4s}, [x11]	/* q5 := (0,0,0,1) (host-endian) */
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
	rev32	v2.16b, v2.16b	/* q2 := ctr (host-endian) */

	/* Decrypt the first block. */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x3, x5		/* x3 := nrounds */
	rev32	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
	bl	aesarmv8_enc1	/* q0 := pad; trash x0/x3/q16 */
	b	2f

	_ALIGN_TEXT
1:	/*
	 * Authenticate the last block and decrypt the next block
	 * simultaneously.
	 *
	 *	q1 = auth ^ ptxt[-1]
	 *	q2 = ctr[-1] (le)
	 */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	rev32	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
	bl	aesarmv8_enc2	/* q0 := pad, q1 := auth';
				 * trash x0/x3/q16 */
2:	eor	v3.16b, v0.16b, v3.16b	/* q3 := plaintext block */
	subs	x10, x10, #0x10
	st1	{v3.16b}, [x2], #0x10	/* store plaintext */
	eor	v1.16b, v1.16b, v3.16b	/* q1 := auth ^ ptxt */
	b.ne	1b

	rev32	v2.16b, v2.16b	/* q2 := ctr (big-endian) */

	/* Authenticate the last block. */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	mov	v0.16b, v1.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc1	/* q0 := auth'; trash x0/x3/q16 */

	mov	v1.16b, v2.16b	/* store updated auth/ctr */
	st1	{v0.16b-v1.16b}, [x4]
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_ccm_dec1)

	.section .rodata
	.p2align 4
	.type	ctr32_inc,@object
ctr32_inc:
	.int	0, 0, 0, 1
END(ctr32_inc)

/*
 * aesarmv8_enc1(const struct aesenc *enckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Encrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
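
/*
 * Note that AESE computes ShiftRows(SubBytes(x ^ roundkey)), so the
 * AddRoundKey that FIPS-197 puts at the end of a round is folded into
 * the AESE of the next round here:
 *
 *	rounds 1..nrounds-1:	x = MixColumns(ShiftRows(SubBytes(x ^ rk[i-1])))
 *	last round:		x = ShiftRows(SubBytes(x ^ rk[nrounds-1])) ^ rk[nrounds]
 *
 * Hence the loop below runs nrounds-1 times and the final round key is
 * applied with a plain EOR.  The same structure is shared by enc2/enc8
 * and, with AESD/AESIMC, by dec1/dec8.
 */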
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc1,@function
aesarmv8_enc1:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0)))) */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	ldr	q16, [x0], #0x10
	subs	x3, x3, #1
	b.ne	1b
	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
	aese	v0.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q0 := AddRoundKey_q16(q0) */
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_enc1)

/*
 * aesarmv8_enc2(const struct aesenc *enckey@x0,
 *     uint128_t block@q0, uint128_t block@q1, uint32_t nrounds@x3)
 *
 *	Encrypt two AES blocks in q0 and q1.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc2,@function
aesarmv8_enc2:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v16.16b
	aesmc	v1.16b, v1.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q[i] := AddRoundKey_q16(q[i]) */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	ret
END(aesarmv8_enc2)

/*
 * aesarmv8_enc8(const struct aesenc *enckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Encrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc8,@function
aesarmv8_enc8:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v16.16b
	aesmc	v1.16b, v1.16b
	aese	v2.16b, v16.16b
	aesmc	v2.16b, v2.16b
	aese	v3.16b, v16.16b
	aesmc	v3.16b, v3.16b
	aese	v4.16b, v16.16b
	aesmc	v4.16b, v4.16b
	aese	v5.16b, v16.16b
	aesmc	v5.16b, v5.16b
	aese	v6.16b, v16.16b
	aesmc	v6.16b, v6.16b
	aese	v7.16b, v16.16b
	aesmc	v7.16b, v7.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	aese	v2.16b, v16.16b
	aese	v3.16b, v16.16b
	aese	v4.16b, v16.16b
	aese	v5.16b, v16.16b
	aese	v6.16b, v16.16b
	aese	v7.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q[i] := AddRoundKey_q16(q[i]) */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_enc8)

/*
 * aesarmv8_dec1(const struct aesdec *deckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Decrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec1,@function
aesarmv8_dec1:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
	aesd	v0.16b, v16.16b
	/* q0 := InMixColumns(q0) */
	aesimc	v0.16b, v0.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
	aesd	v0.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q0 := AddRoundKey_q16(q0) */
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_dec1)

/*
 * aesarmv8_dec8(const struct aesdec *deckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Decrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec8,@function
aesarmv8_dec8:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
	aesd	v0.16b, v16.16b
	/* q[i] := InMixColumns(q[i]) */
	aesimc	v0.16b, v0.16b
	aesd	v1.16b, v16.16b
	aesimc	v1.16b, v1.16b
	aesd	v2.16b, v16.16b
	aesimc	v2.16b, v2.16b
	aesd	v3.16b, v16.16b
	aesimc	v3.16b, v3.16b
	aesd	v4.16b, v16.16b
	aesimc	v4.16b, v4.16b
	aesd	v5.16b, v16.16b
	aesimc	v5.16b, v5.16b
	aesd	v6.16b, v16.16b
	aesimc	v6.16b, v6.16b
	aesd	v7.16b, v16.16b
	aesimc	v7.16b, v7.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
	aesd	v0.16b, v16.16b
	aesd	v1.16b, v16.16b
	aesd	v2.16b, v16.16b
	aesd	v3.16b, v16.16b
	aesd	v4.16b, v16.16b
	aesd	v5.16b, v16.16b
	aesd	v6.16b, v16.16b
	aesd	v7.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q[i] := AddRoundKey_q16(q[i]) */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_dec8)