/*	$NetBSD: aes_armv8_64.S,v 1.6 2020/07/22 06:15:21 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

	.arch_extension	aes

/*
 * uint32_t rcon[10]
 *
 *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) over GF(2).
 *	Such elements of GF(2^8) need only eight bits to be represented,
 *	but we store them in 4-byte units so we can copy one into all
 *	four 4-byte lanes of a vector register with a single LD1R.  The
 *	access pattern is fixed, so indices into this table are never
 *	secret.
 */
	.section .rodata
	.p2align 2
	.type	rcon,@object
rcon:
	.long	0x01
	.long	0x02
	.long	0x04
	.long	0x08
	.long	0x10
	.long	0x20
	.long	0x40
	.long	0x80
	.long	0x1b
	.long	0x36
END(rcon)
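
/*
 * For reference, the table above is just the sequence of AES round
 * constants: successive powers of x in GF(2^8), doubled with the usual
 * 0x1b reduction.  A rough C sketch that would reproduce it, assuming a
 * hypothetical uint32_t rcon[10] destination (not part of the build):
 *
 *	uint8_t rc = 0x01;
 *	for (unsigned i = 0; i < 10; i++) {
 *		rcon[i] = rc;	// 0x01, 0x02, ..., 0x80, 0x1b, 0x36
 *		rc = (uint8_t)((rc << 1) ^ ((rc & 0x80) ? 0x1b : 0));
 *	}
 */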

/*
 * uint128_t unshiftrows_rotword_1
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 1, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_1,@object
unshiftrows_rotword_1:
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
END(unshiftrows_rotword_1)

/*
 * uint128_t unshiftrows_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then copy word
 *	3 into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_3,@object
unshiftrows_3:
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
END(unshiftrows_3)

/*
 * uint128_t unshiftrows_rotword_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 3, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_3,@object
unshiftrows_rotword_3:
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
END(unshiftrows_rotword_3)

/*
 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey128)
	ldr	q1, [x1]	/* q1 := master key */

	adrl	x4, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 table */

	str	q1, [x0], #0x10	/* store master key as first round key */
	mov	x2, #10		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x0 = pointer to round key to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #1	/* count down rounds */
	str	q1, [x0], #0x10	/* store round key */
	b.ne	1b

	ret
END(aesarmv8_setenckey128)
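
/*
 * The loop above computes the same schedule as the word-oriented FIPS 197
 * description.  A rough, non-vectorized C sketch, assuming hypothetical
 * RotWord()/SubWord() helpers for the word rotation and S-box lookup, and
 * the rcon[] constants tabulated earlier (not part of the build):
 *
 *	uint32_t w[4*(10 + 1)];		// 10 rounds of 4 words each,
 *	memcpy(w, key, 16);		// plus the master key up front
 *	for (unsigned i = 4; i < 4*(10 + 1); i++) {
 *		uint32_t t = w[i - 1];
 *		if (i % 4 == 0)
 *			t = SubWord(RotWord(t)) ^ rcon[i/4 - 1];
 *		w[i] = w[i - 4] ^ t;
 *	}
 */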

/*
 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey192)
	ldr	q1, [x1], #0x10	/* q1 := master key[0:128) */
	ldr	d2, [x1]	/* d2 := master key[128:192) */

	adrl	x4, unshiftrows_rotword_1
	adrl	x5, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_1 */
	ldr	q17, [x5]	/* q17 := unshiftrows_rotword_3 */

	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
	mov	x2, #12		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
	 * x0 = pointer to three round keys to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *     ^ rklo[1]
	 */

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
	eor	v5.16b, v5.16b, v1.16b
	eor	v5.16b, v5.16b, v3.16b
	eor	v5.16b, v5.16b, v6.16b
	eor	v5.16b, v5.16b, v7.16b

	/*
	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
	 * (rklo[0],rklo[1],...).
	 */

	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	dup	v1.4s, v5.s[3]
	mov	v1.s[0], v5.s[2]

	/*
	 * v6.4s := (0, 0, rklo[0], rklo[1])
	 * v7.4s := (0, 0, 0, rklo[0])
	 */
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v3.16b, v1.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	/*
	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
	 */
	mov	v2.d[1], v5.d[0]

	/* store two round keys */
	stp	q2, q3, [x0], #0x20

	/*
	 * Live vector registers at this point:
	 *
	 *	q0 = zero
	 *	q2 = rk
	 *	q3 = nrk
	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
	 *	q16 = unshiftrows_rotword_1
	 *	q17 = unshiftrows_rotword_3
	 *
	 * We have to compute, in q1:
	 *
	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1]
	 *
	 * And, if there's any more afterward, in q2:
	 *
	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2]
	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2] ^ nrk[3]
	 */

	/* q1 := RotWords(SubBytes(q3)) */
	mov	v1.16b, v3.16b
	aese	v1.16b, v0.16b

	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
	ld1r	{v4.4s}, [x3], #4
	tbl	v1.16b, {v1.16b}, v17.16b
	eor	v1.16b, v1.16b, v4.16b

	/*
	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1])	[already]
	 * v4.4s := (0, rk[2], rk[3], nrk[0])
	 * v6.4s := (0, 0, rk[2], rk[3])
	 * v7.4s := (0, 0, 0, rk[2])
	 */
	ext	v4.16b, v0.16b, v5.16b, #12
	ext	v6.16b, v0.16b, v5.16b, #8
	ext	v7.16b, v0.16b, v5.16b, #4

	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #3	/* count down three rounds */
	str	q1, [x0], #0x10	/* store third round key */
	b.eq	2f

	/*
	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
	 * v5.4s := (0, nrk[2], xxx, xxx)
	 */
	ext	v4.16b, v3.16b, v0.16b, #8
	ext	v5.16b, v0.16b, v4.16b, #12

	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
	dup	v2.4s, v1.s[3]

	/*
	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
	 *     xxx, xxx)
	 */
	eor	v2.16b, v2.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	b	1b

2:	ret
END(aesarmv8_setenckey192)

/*
 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey256)
	/* q1 := key[0:128), q2 := key[128:256) */
	ldp	q1, q2, [x1], #0x20

	adrl	x4, unshiftrows_rotword_3
	adrl	x5, unshiftrows_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 */
	ldr	q17, [x5]	/* q17 := unshiftrows_3 */

	/* store master key as first two round keys */
	stp	q1, q2, [x0], #0x20
	mov	x2, #14		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
	 * v6.4s := (0,0,pprk[0],pprk[1])
	 * v7.4s := (0,0,0,pprk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #2	/* count down two rounds */
	b.eq	2f	/* stop if this is the last one */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := SubBytes(rk[3]) */
	tbl	v3.16b, {v3.16b}, v17.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v2.16b, #12
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v2.16b, v2.16b, v3.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v2.16b, v2.16b, v7.16b

	stp	q1, q2, [x0], #0x20	/* store two round keys */
	b	1b

2:	str	q1, [x0]	/* store last round key */
	ret
END(aesarmv8_setenckey256)
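
/*
 * Both the AES-192 and AES-256 loops above implement the generic FIPS 197
 * key expansion, just computed four words at a time.  A rough C sketch of
 * the word-oriented form, with Nk = 6 (AES-192) or Nk = 8 (AES-256) and
 * Nr = Nk + 6, assuming the same hypothetical RotWord()/SubWord() helpers
 * as in the AES-128 sketch (not part of the build):
 *
 *	uint32_t w[4*(Nr + 1)];
 *	memcpy(w, key, 4*Nk);
 *	for (unsigned i = Nk; i < 4*(Nr + 1); i++) {
 *		uint32_t t = w[i - 1];
 *		if (i % Nk == 0)
 *			t = SubWord(RotWord(t)) ^ rcon[i/Nk - 1];
 *		else if (Nk > 6 && i % Nk == 4)
 *			t = SubWord(t);		// extra step, AES-256 only
 *		w[i] = w[i - Nk] ^ t;
 *	}
 */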

/*
 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
 *     uint32_t nrounds@x2)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enctodec)
	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
1:	str	q0, [x1], #0x10	/* store round key */
	subs	x2, x2, #1	/* count down round */
	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
	b.eq	2f	/* stop if this is the last one */
	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
	b	1b
2:	str	q0, [x1]	/* store first round key verbatim */
	ret
END(aesarmv8_enctodec)
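
/*
 * In other words, the decryption schedule is the encryption schedule
 * reversed, with InvMixColumns applied to every key except the two
 * endpoints.  A rough C sketch using the ACLE crypto intrinsics from
 * <arm_neon.h>, with hypothetical rk[]/drk[] arrays of nrounds+1
 * 16-byte round keys (not part of the build):
 *
 *	memcpy(drk[0], rk[nrounds], 16);	// last key, verbatim
 *	for (unsigned i = 1; i < nrounds; i++)
 *		vst1q_u8(drk[i],
 *		    vaesimcq_u8(vld1q_u8(rk[nrounds - i])));
 *	memcpy(drk[nrounds], rk[0], 16);	// first key, verbatim
 */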

/*
 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enc)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]	/* q0 := ptxt */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	str	q0, [x2]	/* store ctxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_enc)

/*
 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_dec)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]	/* q0 := ctxt */
	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
	str	q0, [x2]	/* store ptxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_dec)

/*
 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_enc)
	cbz	x3, 2f	/* stop if nothing to do */
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0	/* x9 := enckey */
	mov	x10, x3	/* x10 := nbytes */
	ldr	q0, [x4]	/* q0 := chaining value */
1:	ldr	q1, [x1], #0x10	/* q1 := plaintext block */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
	mov	x0, x9	/* x0 := enckey */
	mov	x3, x5	/* x3 := nrounds */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10	/* count down nbytes */
	str	q0, [x2], #0x10	/* store ciphertext block */
	b.ne	1b	/* repeat if x10 is nonzero */
	str	q0, [x4]	/* store chaining value */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
2:	ret
END(aesarmv8_cbc_enc)
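
/*
 * A rough C sketch of the chaining done by aesarmv8_cbc_enc above, using
 * <arm_neon.h> intrinsics and a hypothetical aes_enc() one-block encrypt
 * standing in for aesarmv8_enc1 (not part of the build):
 *
 *	uint8x16_t cv = vld1q_u8(iv);
 *	for (size_t off = 0; off < nbytes; off += 16) {
 *		cv = veorq_u8(cv, vld1q_u8(in + off));	// cv ^ ptxt
 *		cv = aes_enc(enckey, cv, nrounds);	// becomes the ctxt
 *		vst1q_u8(out + off, cv);		// ...and the next cv
 *	}
 *	vst1q_u8(iv, cv);	// return the updated chaining value
 */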

/*
 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q24, [x4]	/* q24 := iv */
	mov	x9, x0	/* x9 := deckey */
	mov	x10, x3	/* x10 := nbytes */
	add	x1, x1, x3	/* x1 := pointer past end of in */
	add	x2, x2, x3	/* x2 := pointer past end of out */
	ldr	q0, [x1, #-0x10]!	/* q0 := last ciphertext block */
	str	q0, [x4]	/* update iv */
1:	mov	x0, x9	/* x0 := deckey */
	mov	x3, x5	/* x3 := nrounds */
	bl	aesarmv8_dec1	/* q0 := cv ^ ptxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.eq	2f	/* stop if this is the first block */
	ldr	q31, [x1, #-0x10]!	/* q31 := chaining value */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
	str	q0, [x2, #-0x10]!	/* store plaintext block */
	mov	v0.16b, v31.16b	/* move cv = ciphertext block */
	b	1b
2:	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
	str	q0, [x2, #-0x10]!	/* store first plaintext block */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec1)
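
/*
 * aesarmv8_cbc_dec1 above walks the buffer backwards so that it can
 * overwrite iv[] with the last ciphertext block up front and then reuse
 * the input itself for the chaining values.  A rough C sketch with
 * <arm_neon.h> intrinsics and a hypothetical aes_dec() one-block decrypt
 * standing in for aesarmv8_dec1 (not part of the build):
 *
 *	uint8x16_t iv0 = vld1q_u8(iv);			// caller's iv
 *	vst1q_u8(iv, vld1q_u8(in + nbytes - 16));	// iv for next call
 *	for (size_t off = nbytes; off > 0; off -= 16) {
 *		uint8x16_t pt = aes_dec(deckey,
 *		    vld1q_u8(in + off - 16), nrounds);
 *		uint8x16_t cv = (off == 16) ? iv0	// original iv, or
 *		    : vld1q_u8(in + off - 32);		// previous ctxt block
 *		vst1q_u8(out + off - 16, veorq_u8(pt, cv));
 *	}
 */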

/*
 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q24, [x4]	/* q24 := iv */
	mov	x9, x0	/* x9 := deckey */
	mov	x10, x3	/* x10 := nbytes */
	add	x1, x1, x3	/* x1 := pointer past end of in */
	add	x2, x2, x3	/* x2 := pointer past end of out */
	ldp	q6, q7, [x1, #-0x20]!	/* q6, q7 := last ciphertext blocks */
	str	q7, [x4]	/* update iv */
1:	ldp	q4, q5, [x1, #-0x20]!
	ldp	q2, q3, [x1, #-0x20]!
	ldp	q0, q1, [x1, #-0x20]!
	mov	v31.16b, v6.16b	/* q[24+i] := cv[i], 0<i<8 */
	mov	v30.16b, v5.16b
	mov	v29.16b, v4.16b
	mov	v28.16b, v3.16b
	mov	v27.16b, v2.16b
	mov	v26.16b, v1.16b
	mov	v25.16b, v0.16b
	mov	x0, x9	/* x0 := deckey */
	mov	x3, x5	/* x3 := nrounds */
	bl	aesarmv8_dec8	/* q[i] := cv[i] ^ pt[i];
				 * trash x0/x3/q16 */
	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
	eor	v6.16b, v6.16b, v30.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v1.16b, v1.16b, v25.16b
	subs	x10, x10, #0x80	/* count down nbytes */
	stp	q6, q7, [x2, #-0x20]!	/* store plaintext blocks */
	stp	q4, q5, [x2, #-0x20]!
	stp	q2, q3, [x2, #-0x20]!
	b.eq	2f	/* stop if this is the first block */
	ldp	q6, q7, [x1, #-0x20]!
	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!
	b	1b
2:	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!	/* store first two plaintext blocks */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec8)

/*
 * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0	/* x9 := enckey */
	mov	x10, x3	/* x10 := nbytes */
	ldr	q31, [x4]	/* q31 := tweak */
1:	ldr	q0, [x1], #0x10	/* q0 := ptxt */
	mov	x0, x9	/* x0 := enckey */
	mov	x3, x5	/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
	bl	aesarmv8_enc1	/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
	str	q0, [x2], #0x10	/* store ciphertext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b	/* repeat if more blocks */
	str	q31, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc1)
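
/*
 * A rough C sketch of the per-block XTS transform done above, with
 * <arm_neon.h> intrinsics, a hypothetical aes_enc() one-block encrypt
 * standing in for aesarmv8_enc1, and a hypothetical xts_mulx() standing
 * in for aesarmv8_xts_mulx (not part of the build):
 *
 *	uint8x16_t T = vld1q_u8(tweak);
 *	for (size_t off = 0; off < nbytes; off += 16) {
 *		uint8x16_t b = veorq_u8(vld1q_u8(in + off), T);
 *		b = aes_enc(enckey, b, nrounds);
 *		vst1q_u8(out + off, veorq_u8(b, T));
 *		T = xts_mulx(T);	// advance tweak to the next block
 *	}
 *	vst1q_u8(tweak, T);	// return the updated tweak
 */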

/*
 * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0	/* x9 := enckey */
	mov	x10, x3	/* x10 := nbytes */
	ldr	q31, [x4]	/* q31 := tweak */
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	/* q31 := tweak[7] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := ptxt[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9	/* x0 := enckey */
	mov	x3, x5	/* x3 := nrounds */
	bl	aesarmv8_enc8	/* encrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	stp	q0, q1, [x2], #0x20	/* store ciphertext blocks */
	stp	q2, q3, [x2], #0x20
	stp	q4, q5, [x2], #0x20
	stp	q6, q7, [x2], #0x20
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80	/* count down nbytes */
	b.ne	1b	/* repeat if more block groups */
	str	q31, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc8)

/*
 * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0	/* x9 := deckey */
	mov	x10, x3	/* x10 := nbytes */
	ldr	q31, [x4]	/* q31 := tweak */
1:	ldr	q0, [x1], #0x10	/* q0 := ctxt */
	mov	x0, x9	/* x0 := deckey */
	mov	x3, x5	/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
	bl	aesarmv8_dec1	/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
	str	q0, [x2], #0x10	/* store plaintext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b	/* repeat if more blocks */
	str	q31, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec1)

/*
 * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0	/* x9 := deckey */
	mov	x10, x3	/* x10 := nbytes */
	ldr	q31, [x4]	/* q31 := tweak */
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	/* q31 := tweak[7] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := ctxt[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9	/* x0 := deckey */
	mov	x3, x5	/* x3 := nrounds */
	bl	aesarmv8_dec8	/* decrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	stp	q0, q1, [x2], #0x20	/* store plaintext blocks */
	stp	q2, q3, [x2], #0x20
	stp	q4, q5, [x2], #0x20
	stp	q6, q7, [x2], #0x20
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80	/* count down nbytes */
	b.ne	1b	/* repeat if more block groups */
	str	q31, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec8)

/*
 * aesarmv8_xts_mulx(tweak@q31)
 *
 *	Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses x0 and q0/q1 as temporaries.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_xts_mulx,@function
aesarmv8_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low half must be
	 *     shifted into the low bit of the high half, and
	 * (b) whether the high bit of the high half must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	adrl	x0, xtscarry
	cmlt	v1.2d, v31.2d, #0	/* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
	ldr	q0, [x0]	/* q0 := xtscarry */
	ext	v1.16b, v1.16b, v1.16b, #8	/* swap halves of q1 */
	shl	v31.2d, v31.2d, #1	/* shift */
	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
	eor	v31.16b, v31.16b, v0.16b	/* incorporate (a) and (b) */
	ret
END(aesarmv8_xts_mulx)

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0
END(xtscarry)
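
/*
 * Scalar equivalent of aesarmv8_xts_mulx, for reference: double the
 * 128-bit tweak as a polynomial over GF(2), folding the bit shifted out
 * of x^127 back in as x^7 + x^2 + x + 1 (0x87).  A rough C sketch,
 * assuming le64dec()/le64enc() byte-order helpers as in <sys/endian.h>
 * (not part of the build):
 *
 *	uint64_t lo = le64dec(tweak), hi = le64dec(tweak + 8);
 *	uint64_t carry = hi >> 63;		// bit leaving x^127
 *	hi = (hi << 1) | (lo >> 63);
 *	lo = (lo << 1) ^ (carry ? 0x87 : 0);	// x^128 = x^7 + x^2 + x + 1
 *	le64enc(tweak, lo);
 *	le64enc(tweak + 8, hi);
 */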

/*
 * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_update)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q31, [x0]	/* load tweak */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	str	q31, [x1]	/* store tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_update)

/*
 * aesarmv8_enc1(const struct aesenc *enckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Encrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc1,@function
aesarmv8_enc1:
	ldr	q16, [x0], #0x10	/* load round key */
1:	subs	x3, x3, #1
	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
	aese	v0.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.eq	2f
	/* q0 := MixColumns(q0) */
	aesmc	v0.16b, v0.16b
	b	1b
2:	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_enc1)
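
/*
 * A rough C sketch of the round structure above, using the ACLE crypto
 * intrinsics from <arm_neon.h>, with a hypothetical rk[] array of
 * nrounds+1 16-byte round keys and in pointing at the block
 * (not part of the build):
 *
 *	uint8x16_t b = vld1q_u8(in);	// the block in q0
 *	for (unsigned i = 0; i < nrounds - 1; i++)
 *		b = vaesmcq_u8(vaeseq_u8(b, vld1q_u8(rk[i])));
 *	b = vaeseq_u8(b, vld1q_u8(rk[nrounds - 1]));	// last round, no MixColumns
 *	b = veorq_u8(b, vld1q_u8(rk[nrounds]));		// final AddRoundKey
 */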

/*
 * aesarmv8_enc8(const struct aesenc *enckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Encrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc8,@function
aesarmv8_enc8:
	ldr	q16, [x0], #0x10	/* load round key */
1:	subs	x3, x3, #1
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	aese	v2.16b, v16.16b
	aese	v3.16b, v16.16b
	aese	v4.16b, v16.16b
	aese	v5.16b, v16.16b
	aese	v6.16b, v16.16b
	aese	v7.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.eq	2f
	/* q[i] := MixColumns(q[i]) */
	aesmc	v0.16b, v0.16b
	aesmc	v1.16b, v1.16b
	aesmc	v2.16b, v2.16b
	aesmc	v3.16b, v3.16b
	aesmc	v4.16b, v4.16b
	aesmc	v5.16b, v5.16b
	aesmc	v6.16b, v6.16b
	aesmc	v7.16b, v7.16b
	b	1b
2:	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_enc8)

/*
 * aesarmv8_dec1(const struct aesdec *deckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Decrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec1,@function
aesarmv8_dec1:
	ldr	q16, [x0], #0x10	/* load round key */
1:	subs	x3, x3, #1
	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
	aesd	v0.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.eq	2f
	/* q0 := InMixColumns(q0) */
	aesimc	v0.16b, v0.16b
	b	1b
2:	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_dec1)
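
/*
 * The decryption loop above mirrors aesarmv8_enc1 using the equivalent
 * inverse cipher ordering; it works because aesarmv8_enctodec already
 * applied InvMixColumns to the middle round keys.  A rough C sketch with
 * <arm_neon.h> intrinsics and a hypothetical drk[] array of nrounds+1
 * 16-byte decryption round keys (not part of the build):
 *
 *	uint8x16_t b = vld1q_u8(in);	// the block in q0
 *	for (unsigned i = 0; i < nrounds - 1; i++)
 *		b = vaesimcq_u8(vaesdq_u8(b, vld1q_u8(drk[i])));
 *	b = vaesdq_u8(b, vld1q_u8(drk[nrounds - 1]));
 *	b = veorq_u8(b, vld1q_u8(drk[nrounds]));	// final AddRoundKey
 */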

/*
 * aesarmv8_dec8(const struct aesdec *deckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Decrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec8,@function
aesarmv8_dec8:
	ldr	q16, [x0], #0x10	/* load round key */
1:	subs	x3, x3, #1
	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
	aesd	v0.16b, v16.16b
	aesd	v1.16b, v16.16b
	aesd	v2.16b, v16.16b
	aesd	v3.16b, v16.16b
	aesd	v4.16b, v16.16b
	aesd	v5.16b, v16.16b
	aesd	v6.16b, v16.16b
	aesd	v7.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.eq	2f
	/* q[i] := InMixColumns(q[i]) */
	aesimc	v0.16b, v0.16b
	aesimc	v1.16b, v1.16b
	aesimc	v2.16b, v2.16b
	aesimc	v3.16b, v3.16b
	aesimc	v4.16b, v4.16b
	aesimc	v5.16b, v5.16b
	aesimc	v6.16b, v6.16b
	aesimc	v7.16b, v7.16b
	b	1b
2:	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_dec8)