/*	$NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/endian.h>

#include <aarch64/asm.h>

RCSID("$NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $")

	.arch_extension	aes

/*
 * uint32_t rcon[10]
 *
 *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2).
 *	Such elements of GF(2^8) need only eight bits to be represented,
 *	but we store them in 4-byte units so we can copy one into all
 *	four 4-byte lanes of a vector register with a single LD1R.  The
 *	access pattern is fixed, so indices into this table are never
 *	secret.
 */
	.section .rodata
	.p2align 2
	.type	rcon,@object
rcon:
	.long	0x01
	.long	0x02
	.long	0x04
	.long	0x08
	.long	0x10
	.long	0x20
	.long	0x40
	.long	0x80
	.long	0x1b
	.long	0x36
END(rcon)
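
/*
 * For reference only (not used by the build): the table above is the
 * sequence obtained by repeated multiplication by x in GF(2^8); an
 * illustrative C sketch that would reproduce it is
 *
 *	uint8_t rc = 0x01;
 *	for (unsigned i = 0; i < 10; i++) {
 *		rcon[i] = rc;
 *		rc = (rc << 1) ^ ((rc & 0x80) ? 0x1b : 0);
 *	}
 */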

/*
 * uint128_t unshiftrows_rotword_1
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 1, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_1,@object
unshiftrows_rotword_1:
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
END(unshiftrows_rotword_1)

/*
 * uint128_t unshiftrows_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then copy word
 *	3 into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_3,@object
unshiftrows_3:
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
END(unshiftrows_3)

/*
 * uint128_t unshiftrows_rotword_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 3, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_3,@object
unshiftrows_rotword_3:
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
END(unshiftrows_rotword_3)
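
/*
 * For reference, how the TBL indices above arise: AESE leaves the
 * state as ShiftRows(SubBytes(state)), and ShiftRows places input
 * byte (0,5,10,15,4,9,14,3,8,13,2,7,12,1,6,11)[j] at output position
 * j.  To recover RotWord of word 3, i.e. original bytes 13,14,15,12,
 * we therefore gather from positions 9,6,3,12 -- hence
 * 0x09,0x06,0x03,0x0c -- and the other two tables are derived the
 * same way.
 */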

/*
 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey128)
	ldr	q1, [x1]	/* q1 := master key */

	adrl	x4, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 table */

	str	q1, [x0], #0x10	/* store master key as first round key */
	mov	x2, #10		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x0 = pointer to round key to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #1	/* count down rounds */
	str	q1, [x0], #0x10	/* store round key */
	b.ne	1b

	ret
END(aesarmv8_setenckey128)
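
/*
 * For reference, one pass of the loop above computes the standard
 * AES-128 schedule step; an illustrative C sketch (names here are not
 * part of this file's interface) is
 *
 *	rk[0] = prk[0] ^ SubWord(RotWord(prk[3])) ^ rcon[i];
 *	rk[1] = prk[1] ^ rk[0];
 *	rk[2] = prk[2] ^ rk[1];
 *	rk[3] = prk[3] ^ rk[2];
 *
 * which the code evaluates as the XOR of the broadcast value in v3
 * with the shifted copies of the previous round key in v5/v6/v7.
 */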

/*
 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey192)
	ldr	q1, [x1], #0x10	/* q1 := master key[0:128) */
	ldr	d2, [x1]	/* d2 := master key[128:192) */

	adrl	x4, unshiftrows_rotword_1
	adrl	x5, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_1 */
	ldr	q17, [x5]	/* q17 := unshiftrows_rotword_3 */

	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
	mov	x2, #12		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
	 * x0 = pointer to three round keys to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *     ^ rklo[1]
	 */

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
	eor	v5.16b, v5.16b, v1.16b
	eor	v5.16b, v5.16b, v3.16b
	eor	v5.16b, v5.16b, v6.16b
	eor	v5.16b, v5.16b, v7.16b

	/*
	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
	 * (rklo[0],rklo[1],...).
	 */

	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	dup	v1.4s, v5.s[3]
	mov	v1.s[0], v5.s[2]

	/*
	 * v6.4s := (0, 0, rklo[0], rklo[1])
	 * v7.4s := (0, 0, 0, rklo[0])
	 */
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v3.16b, v1.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	/*
	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
	 */
	mov	v2.d[1], v5.d[0]

	/* store two round keys */
	stp	q2, q3, [x0], #0x20

	/*
	 * Live vector registers at this point:
	 *
	 *	q0 = zero
	 *	q2 = rk
	 *	q3 = nrk
	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
	 *	q16 = unshiftrows_rotword_1
	 *	q17 = unshiftrows_rotword_3
	 *
	 * We have to compute, in q1:
	 *
	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1]
	 *
	 * And, if there's any more afterward, in q2:
	 *
	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2]
	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2] ^ nrk[3]
	 */

	/* q1 := ShiftRows(SubBytes(q3)) */
	mov	v1.16b, v3.16b
	aese	v1.16b, v0.16b

	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
	ld1r	{v4.4s}, [x3], #4
	tbl	v1.16b, {v1.16b}, v17.16b
	eor	v1.16b, v1.16b, v4.16b

	/*
	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already]
	 * v4.4s := (0, rk[2], rk[3], nrk[0])
	 * v6.4s := (0, 0, rk[2], rk[3])
	 * v7.4s := (0, 0, 0, rk[2])
	 */
	ext	v4.16b, v0.16b, v5.16b, #12
	ext	v6.16b, v0.16b, v5.16b, #8
	ext	v7.16b, v0.16b, v5.16b, #4

	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #3	/* count down three rounds */
	str	q1, [x0], #0x10	/* store third round key */
	b.eq	2f

	/*
	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
	 * v5.4s := (0, nrk[2], xxx, xxx)
	 */
	ext	v4.16b, v3.16b, v0.16b, #8
	ext	v5.16b, v0.16b, v4.16b, #12

	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
	dup	v2.4s, v1.s[3]

	/*
	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
	 *     xxx, xxx)
	 */
	eor	v2.16b, v2.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	b	1b

2:	ret
END(aesarmv8_setenckey192)
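
/*
 * For reference, the loop above is an unrolling of the generic
 * FIPS-197 key expansion with Nk = 6 words per key, sketched in C
 * (the names here are illustrative only):
 *
 *	for (i = Nk; i < 4*(Nr + 1); i++) {
 *		temp = w[i - 1];
 *		if (i % Nk == 0)
 *			temp = SubWord(RotWord(temp)) ^ rcon[i/Nk - 1];
 *		w[i] = w[i - Nk] ^ temp;
 *	}
 *
 * Each pass of the loop emits twelve words (three 16-byte round keys)
 * and therefore consumes two rcon values.
 */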

/*
 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey256)
	/* q1 := key[0:128), q2 := key[128:256) */
	ldp	q1, q2, [x1], #0x20

	adrl	x4, unshiftrows_rotword_3
	adrl	x5, unshiftrows_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 */
	ldr	q17, [x5]	/* q17 := unshiftrows_3 */

	/* store master key as first two round keys */
	stp	q1, q2, [x0], #0x20
	mov	x2, #14		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
	 * v6.4s := (0,0,pprk[0],pprk[1])
	 * v7.4s := (0,0,0,pprk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #2	/* count down two rounds */
	b.eq	2f		/* stop if this is the last one */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := SubBytes(rk[3]) */
	tbl	v3.16b, {v3.16b}, v17.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v2.16b, #12
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v2.16b, v2.16b, v3.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v2.16b, v2.16b, v7.16b

	stp	q1, q2, [x0], #0x20	/* store two round keys */
	b	1b

2:	str	q1, [x0]	/* store last round key */
	ret
END(aesarmv8_setenckey256)
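
/*
 * For reference, AES-256 (Nk = 8) follows the same FIPS-197 recurrence
 * sketched above, except that words with i % 8 == 4 get SubWord alone,
 * with no RotWord and no rcon; that is the branch using unshiftrows_3.
 * An illustrative C sketch of the per-word rule:
 *
 *	temp = w[i - 1];
 *	if (i % 8 == 0)
 *		temp = SubWord(RotWord(temp)) ^ rcon[i/8 - 1];
 *	else if (i % 8 == 4)
 *		temp = SubWord(temp);
 *	w[i] = w[i - 8] ^ temp;
 */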

/*
 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
 *     uint32_t nrounds@x2)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enctodec)
	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
	b	2f
	_ALIGN_TEXT
1:	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
2:	str	q0, [x1], #0x10	/* store round key */
	subs	x2, x2, #1	/* count down round */
	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
	b.ne	1b		/* repeat if there's more */
	str	q0, [x1]	/* store first round key verbatim */
	ret
END(aesarmv8_enctodec)
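
/*
 * For reference, this is the standard equivalent-inverse-cipher
 * transformation: dec[0] = enc[nrounds], dec[i] =
 * InvMixColumns(enc[nrounds - i]) for 0 < i < nrounds, and
 * dec[nrounds] = enc[0], which is what the AESIMC in the loop and the
 * verbatim copies at either end produce.
 */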

/*
 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enc)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]	/* q0 := ptxt */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	str	q0, [x2]	/* store ctxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_enc)

/*
 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_dec)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]	/* q0 := ctxt */
	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
	str	q0, [x2]	/* store ptxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_dec)

/*
 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_enc)
	cbz	x3, 2f		/* stop if nothing to do */
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
	ldr	q0, [x4]	/* q0 := chaining value */
	_ALIGN_TEXT
1:	ldr	q1, [x1], #0x10	/* q1 := plaintext block */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10	/* count down nbytes */
	str	q0, [x2], #0x10	/* store ciphertext block */
	b.ne	1b		/* repeat if x10 is nonzero */
	str	q0, [x4]	/* store chaining value */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
2:	ret
END(aesarmv8_cbc_enc)
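
/*
 * For reference, the loop above implements the usual CBC recurrence,
 * with cv initialized from iv and written back to iv at the end:
 *
 *	c[i] = E_k(p[i] ^ c[i-1]),	c[-1] = iv
 */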

/*
 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q24, [x4]	/* q24 := iv */
	mov	x9, x0		/* x9 := deckey */
	mov	x10, x3		/* x10 := nbytes */
	add	x1, x1, x3	/* x1 := pointer past end of in */
	add	x2, x2, x3	/* x2 := pointer past end of out */
	ldr	q0, [x1, #-0x10]!	/* q0 := last ciphertext block */
	str	q0, [x4]	/* update iv */
	b	2f
	_ALIGN_TEXT
1:	ldr	q31, [x1, #-0x10]!	/* q31 := chaining value */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
	str	q0, [x2, #-0x10]!	/* store plaintext block */
	mov	v0.16b, v31.16b	/* move cv = ciphertext block */
2:	mov	x0, x9		/* x0 := deckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_dec1	/* q0 := cv ^ ptxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b		/* repeat if more blocks */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
	str	q0, [x2, #-0x10]!	/* store first plaintext block */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec1)

/*
 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q24, [x4]	/* q24 := iv */
	mov	x9, x0		/* x9 := deckey */
	mov	x10, x3		/* x10 := nbytes */
	add	x1, x1, x3	/* x1 := pointer past end of in */
	add	x2, x2, x3	/* x2 := pointer past end of out */
	ldp	q6, q7, [x1, #-0x20]!	/* q6, q7 := last ciphertext blocks */
	str	q7, [x4]	/* update iv */
	b	2f
	_ALIGN_TEXT
1:	ldp	q6, q7, [x1, #-0x20]!
	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!
2:	ldp	q4, q5, [x1, #-0x20]!
	ldp	q2, q3, [x1, #-0x20]!
	ldp	q0, q1, [x1, #-0x20]!
	mov	v31.16b, v6.16b	/* q[24+i] := cv[i], 0<i<8 */
	mov	v30.16b, v5.16b
	mov	v29.16b, v4.16b
	mov	v28.16b, v3.16b
	mov	v27.16b, v2.16b
	mov	v26.16b, v1.16b
	mov	v25.16b, v0.16b
	mov	x0, x9		/* x0 := deckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_dec8	/* q[i] := cv[i] ^ pt[i];
				 * trash x0/x3/q16 */
	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
	eor	v6.16b, v6.16b, v30.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v1.16b, v1.16b, v25.16b
	subs	x10, x10, #0x80	/* count down nbytes */
	stp	q6, q7, [x2, #-0x20]!	/* store plaintext blocks */
	stp	q4, q5, [x2, #-0x20]!
	stp	q2, q3, [x2, #-0x20]!
	b.ne	1b		/* repeat if there's more */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!	/* store first two plaintext blocks */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec8)
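
/*
 * For reference, both CBC decryption routines compute
 *
 *	p[i] = D_k(c[i]) ^ c[i-1],	c[-1] = iv
 *
 * walking the buffer from the end toward the beginning, after first
 * saving the last ciphertext block as the next call's iv.
 */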

/*
 * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
	ldr	q31, [x4]	/* q31 := tweak */
	_ALIGN_TEXT
1:	ldr	q0, [x1], #0x10	/* q0 := ptxt */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
	bl	aesarmv8_enc1	/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
	str	q0, [x2], #0x10	/* store ciphertext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b		/* repeat if more blocks */
	str	q31, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc1)

/*
 * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
	ldr	q31, [x4]	/* q31 := tweak */
	_ALIGN_TEXT
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	/* q31 := tweak[7] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := ptxt[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_enc8	/* encrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	stp	q0, q1, [x2], #0x20	/* store ciphertext blocks */
	stp	q2, q3, [x2], #0x20
	stp	q4, q5, [x2], #0x20
	stp	q6, q7, [x2], #0x20
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80	/* count down nbytes */
	b.ne	1b		/* repeat if more block groups */
	str	q31, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc8)

/*
 * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := deckey */
	mov	x10, x3		/* x10 := nbytes */
	ldr	q31, [x4]	/* q31 := tweak */
	_ALIGN_TEXT
1:	ldr	q0, [x1], #0x10	/* q0 := ctxt */
	mov	x0, x9		/* x0 := deckey */
	mov	x3, x5		/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
	bl	aesarmv8_dec1	/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
	str	q0, [x2], #0x10	/* store plaintext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b		/* repeat if more blocks */
	str	q31, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec1)

/*
 * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := deckey */
	mov	x10, x3		/* x10 := nbytes */
	ldr	q31, [x4]	/* q31 := tweak */
	_ALIGN_TEXT
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	/* q31 := tweak[7] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := ctxt[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9		/* x0 := deckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_dec8	/* decrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	stp	q0, q1, [x2], #0x20	/* store plaintext blocks */
	stp	q2, q3, [x2], #0x20
	stp	q4, q5, [x2], #0x20
	stp	q6, q7, [x2], #0x20
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80	/* count down nbytes */
	b.ne	1b		/* repeat if more block groups */
	str	q31, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec8)

/*
 * aesarmv8_xts_mulx(tweak@q31)
 *
 *	Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses x0 and q0/q1 as temporaries.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_xts_mulx,@function
aesarmv8_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low half must be
	 *     shifted into the low bit of the high half, and
	 * (b) whether the high bit of the high half must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	adrl	x0, xtscarry
	cmlt	v1.2d, v31.2d, #0	/* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
	ldr	q0, [x0]	/* q0 := xtscarry */
	ext	v1.16b, v1.16b, v1.16b, #8	/* swap halves of q1 */
	shl	v31.2d, v31.2d, #1	/* shift */
	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
	eor	v31.16b, v31.16b, v0.16b	/* incorporate (a) and (b) */
	ret
END(aesarmv8_xts_mulx)

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0
END(xtscarry)
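
/*
 * For reference, with the tweak viewed as two little-endian 64-bit
 * halves (lo, hi), the multiplication by x that aesarmv8_xts_mulx and
 * xtscarry implement can be sketched in C as
 *
 *	carry_lo = lo >> 63;
 *	carry_hi = hi >> 63;
 *	lo = (lo << 1) ^ (carry_hi ? 0x87 : 0);
 *	hi = (hi << 1) ^ carry_lo;
 */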

/*
 * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_update)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q31, [x0]	/* load tweak */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	str	q31, [x1]	/* store tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_update)

/*
 * aesarmv8_cbcmac_update1(const struct aesenc *enckey@x0,
 *     const uint8_t *in@x1, size_t nbytes@x2, uint8_t auth[16] @x3,
 *     uint32_t nrounds@x4)
 *
 *	Update CBC-MAC.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbcmac_update1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x3]	/* q0 := initial authenticator */
	mov	x9, x0		/* x9 := enckey */
	mov	x5, x3		/* x5 := &auth (enc1 trashes x3) */
	_ALIGN_TEXT
1:	ldr	q1, [x1], #0x10	/* q1 := plaintext block */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x4		/* x3 := nrounds */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc1	/* q0 := auth'; trash x0/x3/q16 */
	subs	x2, x2, #0x10	/* count down nbytes */
	b.ne	1b		/* repeat if x2 is nonzero */
	str	q0, [x5]	/* store updated authenticator */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbcmac_update1)

/*
 * aesarmv8_ccm_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Update CCM encryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_ccm_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldp	q0, q2, [x4]	/* q0 := auth, q2 := ctr (be) */
	adrl	x11, ctr32_inc	/* x11 := &ctr32_inc */
	ld1	{v5.4s}, [x11]	/* q5 := (0,0,0,1) (host-endian) */
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v2.16b, v2.16b	/* q2 := ctr (host-endian) */
#endif
	_ALIGN_TEXT
1:	ldr	q3, [x1], #0x10	/* q3 := plaintext block */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v1.16b, v2.16b	/* q1 := ctr (big-endian) */
#else
	mov	v1.16b, v2.16b	/* q1 := ctr (big-endian) */
#endif
	eor	v0.16b, v0.16b, v3.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc2	/* q0 := auth', q1 := pad;
				 * trash x0/x3/q16 */
	eor	v3.16b, v1.16b, v3.16b	/* q3 := ciphertext block */
	subs	x10, x10, #0x10	/* count down bytes */
	str	q3, [x2], #0x10	/* store ciphertext block */
	b.ne	1b		/* repeat if more blocks */
#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v2.16b, v2.16b	/* q2 := ctr (big-endian) */
#endif
	stp	q0, q2, [x4]	/* store updated auth/ctr */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_ccm_enc1)
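
/*
 * For reference, each iteration of the loop above advances the two
 * halves of the CCM state together:
 *
 *	auth = E_k(auth ^ p[i])		(CBC-MAC half, authctr[0:16))
 *	c[i] = p[i] ^ E_k(ctr + 1 + i)	(CTR half, authctr[16:32))
 *
 * with the two block encryptions done in parallel by aesarmv8_enc2.
 */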

/*
 * aesarmv8_ccm_dec1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Update CCM decryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_ccm_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldp	q1, q2, [x4]	/* q1 := auth, q2 := ctr (be) */
	adrl	x11, ctr32_inc	/* x11 := &ctr32_inc */
	ld1	{v5.4s}, [x11]	/* q5 := (0,0,0,1) (host-endian) */
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v2.16b, v2.16b	/* q2 := ctr (host-endian) */
#endif

	/* Decrypt the first block. */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x3, x5		/* x3 := nrounds */
#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
#else
	mov	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
#endif
	ldr	q3, [x1], #0x10	/* q3 := ctxt */
	bl	aesarmv8_enc1	/* q0 := pad; trash x0/x3/q16 */
	b	2f

	_ALIGN_TEXT
1:	/*
	 * Authenticate the last block and decrypt the next block
	 * simultaneously.
	 *
	 *	q1 = auth ^ ptxt[-1]
	 *	q2 = ctr[-1] (le)
	 */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
#else
	mov	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
#endif
	ldr	q3, [x1], #0x10	/* q3 := ctxt */
	bl	aesarmv8_enc2	/* q0 := pad, q1 := auth';
				 * trash x0/x3/q16 */
2:	eor	v3.16b, v0.16b, v3.16b	/* q3 := plaintext block */
	subs	x10, x10, #0x10
	str	q3, [x2], #0x10	/* store plaintext */
	eor	v1.16b, v1.16b, v3.16b	/* q1 := auth ^ ptxt */
	b.ne	1b

#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v2.16b, v2.16b	/* q2 := ctr (big-endian) */
#endif

	/* Authenticate the last block. */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	mov	v0.16b, v1.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc1	/* q0 := auth'; trash x0/x3/q16 */
	stp	q0, q2, [x4]	/* store updated auth/ctr */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_ccm_dec1)

	.section .rodata
	.p2align 4
	.type	ctr32_inc,@object
ctr32_inc:
	.int	0, 0, 0, 1
END(ctr32_inc)

/*
 * aesarmv8_enc1(const struct aesenc *enckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Encrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc1,@function
aesarmv8_enc1:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0)))) */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	ldr	q16, [x0], #0x10
	subs	x3, x3, #1
	b.ne	1b
	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
	aese	v0.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q0 := AddRoundKey_q16(q0) */
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_enc1)

/*
 * aesarmv8_enc2(const struct aesenc *enckey@x0,
 *     uint128_t block@q0, uint128_t block@q1, uint32_t nrounds@x3)
 *
 *	Encrypt two AES blocks in q0 and q1.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc2,@function
aesarmv8_enc2:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v16.16b
	aesmc	v1.16b, v1.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q[i] := AddRoundKey_q16(q[i]) */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	ret
END(aesarmv8_enc2)

/*
 * aesarmv8_enc8(const struct aesenc *enckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Encrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc8,@function
aesarmv8_enc8:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v16.16b
	aesmc	v1.16b, v1.16b
	aese	v2.16b, v16.16b
	aesmc	v2.16b, v2.16b
	aese	v3.16b, v16.16b
	aesmc	v3.16b, v3.16b
	aese	v4.16b, v16.16b
	aesmc	v4.16b, v4.16b
	aese	v5.16b, v16.16b
	aesmc	v5.16b, v5.16b
	aese	v6.16b, v16.16b
	aesmc	v6.16b, v6.16b
	aese	v7.16b, v16.16b
	aesmc	v7.16b, v7.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	aese	v2.16b, v16.16b
	aese	v3.16b, v16.16b
	aese	v4.16b, v16.16b
	aese	v5.16b, v16.16b
	aese	v6.16b, v16.16b
	aese	v7.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q[i] := AddRoundKey_q16(q[i]) */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_enc8)

/*
 * aesarmv8_dec1(const struct aesdec *deckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Decrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec1,@function
aesarmv8_dec1:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
	aesd	v0.16b, v16.16b
	/* q0 := InMixColumns(q0) */
	aesimc	v0.16b, v0.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
	aesd	v0.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q0 := AddRoundKey_q16(q0) */
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_dec1)

/*
 * aesarmv8_dec8(const struct aesdec *deckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Decrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec8,@function
aesarmv8_dec8:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
	aesd	v0.16b, v16.16b
	/* q[i] := InMixColumns(q[i]) */
	aesimc	v0.16b, v0.16b
	aesd	v1.16b, v16.16b
	aesimc	v1.16b, v1.16b
	aesd	v2.16b, v16.16b
	aesimc	v2.16b, v2.16b
	aesd	v3.16b, v16.16b
	aesimc	v3.16b, v3.16b
	aesd	v4.16b, v16.16b
	aesimc	v4.16b, v4.16b
	aesd	v5.16b, v16.16b
	aesimc	v5.16b, v5.16b
	aesd	v6.16b, v16.16b
	aesimc	v6.16b, v6.16b
	aesd	v7.16b, v16.16b
	aesimc	v7.16b, v7.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
	aesd	v0.16b, v16.16b
	aesd	v1.16b, v16.16b
	aesd	v2.16b, v16.16b
	aesd	v3.16b, v16.16b
	aesd	v4.16b, v16.16b
	aesd	v5.16b, v16.16b
	aesd	v6.16b, v16.16b
	aesd	v7.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q[i] := AddRoundKey_q16(q[i]) */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_dec8)