      1 /*	$NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  * POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 #include <sys/endian.h>
     30 
     31 #include <aarch64/asm.h>
     32 
     33 RCSID("$NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $")
     34 
     35 	.arch_extension	aes
     36 
     37 /*
     38  * uint32_t rcon[10]
     39  *
     40  *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2).
      41  *	Such elements of GF(2^8) need only eight bits to be represented,
     42  *	but we store them in 4-byte units so we can copy one into all
     43  *	four 4-byte lanes of a vector register with a single LD1R.  The
     44  *	access pattern is fixed, so indices into this table are never
     45  *	secret.
     46  */
     47 	.section .rodata
     48 	.p2align 2
     49 	.type	rcon,@object
     50 rcon:
     51 	.long	0x01
     52 	.long	0x02
     53 	.long	0x04
     54 	.long	0x08
     55 	.long	0x10
     56 	.long	0x20
     57 	.long	0x40
     58 	.long	0x80
     59 	.long	0x1b
     60 	.long	0x36
     61 END(rcon)
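
/*
 * For reference, a minimal C sketch (illustrative only, not assembled
 * into this file) that regenerates the rcon table above by repeated
 * doubling in GF(2^8), reducing by the AES polynomial when the byte
 * overflows:
 *
 *	#include <stdint.h>
 *
 *	static void
 *	gen_rcon(uint32_t rcon[10])
 *	{
 *		uint8_t x = 1;
 *
 *		for (unsigned n = 0; n < 10; n++) {
 *			rcon[n] = x;	// x^n mod (x^8 + x^4 + x^3 + x + 1)
 *			x = (x << 1) ^ ((x & 0x80) ? 0x1b : 0);
 *		}
 *	}
 */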
     62 
     63 /*
     64  * uint128_t unshiftrows_rotword_1
     65  *
     66  *	Table for TBL instruction to undo ShiftRows, and then do
     67  *	RotWord on word 1, and then copy it into all the other words.
     68  */
     69 	.section .rodata
     70 	.p2align 4
     71 	.type	unshiftrows_rotword_1,@object
     72 unshiftrows_rotword_1:
     73 	.byte	0x01,0x0e,0x0b,0x04
     74 	.byte	0x01,0x0e,0x0b,0x04
     75 	.byte	0x01,0x0e,0x0b,0x04
     76 	.byte	0x01,0x0e,0x0b,0x04
     77 END(unshiftrows_rotword_1)
     78 
     79 /*
     80  * uint128_t unshiftrows_3
     81  *
     82  *	Table for TBL instruction to undo ShiftRows, and then copy word
     83  *	3 into all the other words.
     84  */
     85 	.section .rodata
     86 	.p2align 4
     87 	.type	unshiftrows_3,@object
     88 unshiftrows_3:
     89 	.byte	0x0c,0x09,0x06,0x03
     90 	.byte	0x0c,0x09,0x06,0x03
     91 	.byte	0x0c,0x09,0x06,0x03
     92 	.byte	0x0c,0x09,0x06,0x03
     93 END(unshiftrows_3)
     94 
     95 /*
     96  * uint128_t unshiftrows_rotword_3
     97  *
     98  *	Table for TBL instruction to undo ShiftRows, and then do
     99  *	RotWord on word 3, and then copy it into all the other words.
    100  */
    101 	.section .rodata
    102 	.p2align 4
    103 	.type	unshiftrows_rotword_3,@object
    104 unshiftrows_rotword_3:
    105 	.byte	0x09,0x06,0x03,0x0c
    106 	.byte	0x09,0x06,0x03,0x0c
    107 	.byte	0x09,0x06,0x03,0x0c
    108 	.byte	0x09,0x06,0x03,0x0c
    109 END(unshiftrows_rotword_3)
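
/*
 * The byte indices in the three tables above follow from the
 * ShiftRows permutation: AESE with a zero round key leaves
 * ShiftRows(SubBytes(prk)) in the register, and output byte j of
 * ShiftRows comes from input byte shiftrows[j], so reading byte i of
 * SubBytes(prk) back out with TBL takes the index j for which
 * shiftrows[j] == i.  Illustrative C sketch (the names are ours, not
 * part of this file):
 *
 *	#include <stdint.h>
 *
 *	static const uint8_t shiftrows[16] = {
 *		 0,  5, 10, 15,  4,  9, 14,  3,
 *		 8, 13,  2,  7, 12,  1,  6, 11,
 *	};
 *
 *	// TBL index selecting SubBytes(prk)[i] from ShiftRows(SubBytes(prk))
 *	static uint8_t
 *	unshift(uint8_t i)
 *	{
 *		for (uint8_t j = 0; j < 16; j++)
 *			if (shiftrows[j] == i)
 *				return j;
 *		return 0xff;	// not reached for i < 16
 *	}
 *
 * unshiftrows_rotword_1 repeats unshift of bytes 5,6,7,4 (RotWord of
 * word 1); unshiftrows_3 repeats unshift of bytes 12,13,14,15 (word 3
 * unrotated); unshiftrows_rotword_3 repeats unshift of bytes
 * 13,14,15,12 (RotWord of word 3).
 */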
    110 
    111 /*
    112  * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
    113  *
    114  *	Expand a 16-byte AES-128 key into 10 round keys.
    115  *
    116  *	Standard ABI calling convention.
    117  */
    118 ENTRY(aesarmv8_setenckey128)
    119 	ldr	q1, [x1]	/* q1 := master key */
    120 
    121 	adrl	x4, unshiftrows_rotword_3
    122 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
    123 	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 table */
    124 
    125 	str	q1, [x0], #0x10	/* store master key as first round key */
    126 	mov	x2, #10		/* round count */
    127 	adrl	x3, rcon	/* round constant */
    128 
    129 1:	/*
    130 	 * q0 = 0
    131 	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
    132 	 * x0 = pointer to round key to compute
    133 	 * x2 = round count
    134 	 * x3 = rcon pointer
    135 	 */
    136 
    137 	/* q3 := ShiftRows(SubBytes(q1)) */
    138 	mov	v3.16b, v1.16b
    139 	aese	v3.16b, v0.16b
    140 
    141 	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
    142 	ld1r	{v4.4s}, [x3], #4
    143 	tbl	v3.16b, {v3.16b}, v16.16b
    144 	eor	v3.16b, v3.16b, v4.16b
    145 
    146 	/*
    147 	 * v5.4s := (0,prk[0],prk[1],prk[2])
    148 	 * v6.4s := (0,0,prk[0],prk[1])
    149 	 * v7.4s := (0,0,0,prk[0])
    150 	 */
    151 	ext	v5.16b, v0.16b, v1.16b, #12
    152 	ext	v6.16b, v0.16b, v1.16b, #8
    153 	ext	v7.16b, v0.16b, v1.16b, #4
    154 
    155 	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
    156 	eor	v1.16b, v1.16b, v3.16b
    157 	eor	v1.16b, v1.16b, v5.16b
    158 	eor	v1.16b, v1.16b, v6.16b
    159 	eor	v1.16b, v1.16b, v7.16b
    160 
    161 	subs	x2, x2, #1	/* count down rounds */
    162 	str	q1, [x0], #0x10	/* store round key */
    163 	b.ne	1b
    164 
    165 	ret
    166 END(aesarmv8_setenckey128)
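
/*
 * The loop above is the usual word-oriented AES-128 key schedule,
 * computed four words at a time: the EORs with v3/v5/v6/v7 form the
 * running prefix XOR in all four lanes at once.  Equivalent C sketch
 * in the spirit of FIPS-197, byte-order details elided; subword(),
 * rotword(), and the extern declarations are hypothetical helpers,
 * not part of this file:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	extern uint32_t subword(uint32_t);	// AES S-box on each byte
 *	extern uint32_t rotword(uint32_t);	// rotate word by one byte
 *	extern const uint32_t rcon[10];		// the table above
 *
 *	void
 *	expand128(uint32_t rk[44], const uint32_t key[4])
 *	{
 *		memcpy(rk, key, 16);
 *		for (unsigned i = 4; i < 44; i += 4) {
 *			rk[i + 0] = rk[i - 4] ^
 *			    subword(rotword(rk[i - 1])) ^ rcon[i/4 - 1];
 *			rk[i + 1] = rk[i - 3] ^ rk[i + 0];
 *			rk[i + 2] = rk[i - 2] ^ rk[i + 1];
 *			rk[i + 3] = rk[i - 1] ^ rk[i + 2];
 *		}
 *	}
 */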
    167 
    168 /*
    169  * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
    170  *
    171  *	Expand a 24-byte AES-192 key into 12 round keys.
    172  *
    173  *	Standard ABI calling convention.
    174  */
    175 ENTRY(aesarmv8_setenckey192)
    176 	ldr	q1, [x1], #0x10	/* q1 := master key[0:128) */
    177 	ldr	d2, [x1]	/* d2 := master key[128:192) */
    178 
    179 	adrl	x4, unshiftrows_rotword_1
    180 	adrl	x5, unshiftrows_rotword_3
    181 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
    182 	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_1 */
    183 	ldr	q17, [x5]	/* q17 := unshiftrows_rotword_3 */
    184 
    185 	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
    186 	mov	x2, #12		/* round count */
    187 	adrl	x3, rcon	/* round constant */
    188 
    189 1:	/*
    190 	 * q0 = 0
    191 	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
    192 	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
    193 	 * x0 = pointer to three round keys to compute
    194 	 * x2 = round count
    195 	 * x3 = rcon pointer
    196 	 */
    197 
    198 	/* q3 := ShiftRows(SubBytes(q2)) */
    199 	mov	v3.16b, v2.16b
    200 	aese	v3.16b, v0.16b
    201 
    202 	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
    203 	ld1r	{v4.4s}, [x3], #4
    204 	tbl	v3.16b, {v3.16b}, v16.16b
    205 	eor	v3.16b, v3.16b, v4.16b
    206 
    207 	/*
    208 	 * We need to compute:
    209 	 *
    210 	 * rk[0] := rklo[0]
    211 	 * rk[1] := rklo[1]
    212 	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
    213 	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
    214 	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
    215 	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
    216 	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
    217 	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
    218 	 *     ^ rklo[1]
    219 	 */
    220 
    221 	/*
    222 	 * v5.4s := (0,prk[0],prk[1],prk[2])
    223 	 * v6.4s := (0,0,prk[0],prk[1])
    224 	 * v7.4s := (0,0,0,prk[0])
    225 	 */
    226 	ext	v5.16b, v0.16b, v1.16b, #12
    227 	ext	v6.16b, v0.16b, v1.16b, #8
    228 	ext	v7.16b, v0.16b, v1.16b, #4
    229 
    230 	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
    231 	eor	v5.16b, v5.16b, v1.16b
    232 	eor	v5.16b, v5.16b, v3.16b
    233 	eor	v5.16b, v5.16b, v6.16b
    234 	eor	v5.16b, v5.16b, v7.16b
    235 
    236 	/*
    237 	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
    238 	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
    239 	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
    240 	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
    241 	 * (rklo[0],rklo[1],...).
    242 	 */
    243 
    244 	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
    245 	dup	v1.4s, v5.s[3]
    246 	mov	v1.s[0], v5.s[2]
    247 
    248 	/*
    249 	 * v6.4s := (0, 0, rklo[0], rklo[1])
    250 	 * v7.4s := (0, 0, 0, rklo[0])
    251 	 */
    252 	ext	v6.16b, v0.16b, v2.16b, #8
    253 	ext	v7.16b, v0.16b, v2.16b, #4
    254 
    255 	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
    256 	eor	v3.16b, v1.16b, v6.16b
    257 	eor	v3.16b, v3.16b, v7.16b
    258 
    259 	/*
    260 	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
    261 	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
    262 	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
    263 	 */
    264 	mov	v2.d[1], v5.d[0]
    265 
    266 	/* store two round keys */
    267 	stp	q2, q3, [x0], #0x20
    268 
    269 	/*
    270 	 * Live vector registers at this point:
    271 	 *
    272 	 *	q0 = zero
    273 	 *	q2 = rk
    274 	 *	q3 = nrk
    275 	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
    276 	 *	q16 = unshiftrows_rotword_1
    277 	 *	q17 = unshiftrows_rotword_3
    278 	 *
    279 	 * We have to compute, in q1:
    280 	 *
    281 	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
    282 	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
    283 	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
    284 	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
    285 	 *     ^ nrk[1]
    286 	 *
    287 	 * And, if there's any more afterward, in q2:
    288 	 *
    289 	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
    290 	 *     ^ nrk[1] ^ nrk[2]
    291 	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
    292 	 *     ^ nrk[1] ^ nrk[2] ^ nrk[3]
    293 	 */
    294 
     295 	/* q1 := ShiftRows(SubBytes(q3)) */
    296 	mov	v1.16b, v3.16b
    297 	aese	v1.16b, v0.16b
    298 
    299 	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
    300 	ld1r	{v4.4s}, [x3], #4
    301 	tbl	v1.16b, {v1.16b}, v17.16b
    302 	eor	v1.16b, v1.16b, v4.16b
    303 
    304 	/*
    305 	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already]
    306 	 * v4.4s := (0, rk[2], rk[3], nrk[0])
    307 	 * v6.4s := (0, 0, rk[2], rk[3])
    308 	 * v7.4s := (0, 0, 0, rk[2])
    309 	 */
    310 	ext	v4.16b, v0.16b, v5.16b, #12
    311 	ext	v6.16b, v0.16b, v5.16b, #8
    312 	ext	v7.16b, v0.16b, v5.16b, #4
    313 
    314 	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
    315 	eor	v1.16b, v1.16b, v5.16b
    316 	eor	v1.16b, v1.16b, v4.16b
    317 	eor	v1.16b, v1.16b, v6.16b
    318 	eor	v1.16b, v1.16b, v7.16b
    319 
    320 	subs	x2, x2, #3	/* count down three rounds */
    321 	str	q1, [x0], #0x10	/* store third round key */
    322 	b.eq	2f
    323 
    324 	/*
    325 	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
    326 	 * v5.4s := (0, nrk[2], xxx, xxx)
    327 	 */
    328 	ext	v4.16b, v3.16b, v0.16b, #8
    329 	ext	v5.16b, v0.16b, v4.16b, #12
    330 
    331 	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
    332 	dup	v2.4s, v1.s[3]
    333 
    334 	/*
    335 	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
    336 	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
    337 	 *     xxx, xxx)
    338 	 */
    339 	eor	v2.16b, v2.16b, v4.16b
    340 	eor	v2.16b, v2.16b, v5.16b
    341 
    342 	b	1b
    343 
    344 2:	ret
    345 END(aesarmv8_setenckey192)
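
/*
 * Word for word this is the standard AES-192 schedule: each
 * RotWord/SubWord application yields six new words, but round keys
 * are stored four words at a time, hence the three round keys (and
 * two round constants) per loop iteration above.  Equivalent C
 * sketch, reusing the hypothetical helpers sketched after
 * aesarmv8_setenckey128:
 *
 *	void
 *	expand192(uint32_t rk[52], const uint32_t key[6])
 *	{
 *		memcpy(rk, key, 24);
 *		for (unsigned i = 6; i < 52; i++) {
 *			uint32_t t = rk[i - 1];
 *
 *			if (i % 6 == 0)
 *				t = subword(rotword(t)) ^ rcon[i/6 - 1];
 *			rk[i] = rk[i - 6] ^ t;
 *		}
 *	}
 */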
    346 
    347 /*
    348  * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
    349  *
    350  *	Expand a 32-byte AES-256 key into 14 round keys.
    351  *
    352  *	Standard ABI calling convention.
    353  */
    354 ENTRY(aesarmv8_setenckey256)
    355 	/* q1 := key[0:128), q2 := key[128:256) */
    356 	ldp	q1, q2, [x1], #0x20
    357 
    358 	adrl	x4, unshiftrows_rotword_3
    359 	adrl	x5, unshiftrows_3
    360 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
    361 	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 */
    362 	ldr	q17, [x5]	/* q17 := unshiftrows_3 */
    363 
    364 	/* store master key as first two round keys */
    365 	stp	q1, q2, [x0], #0x20
    366 	mov	x2, #14		/* round count */
    367 	adrl	x3, rcon	/* round constant */
    368 
    369 1:	/*
    370 	 * q0 = 0
    371 	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
    372 	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
    373 	 * x2 = round count
    374 	 * x3 = rcon pointer
    375 	 */
    376 
    377 	/* q3 := ShiftRows(SubBytes(q2)) */
    378 	mov	v3.16b, v2.16b
    379 	aese	v3.16b, v0.16b
    380 
    381 	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
    382 	ld1r	{v4.4s}, [x3], #4
    383 	tbl	v3.16b, {v3.16b}, v16.16b
    384 	eor	v3.16b, v3.16b, v4.16b
    385 
    386 	/*
    387 	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
    388 	 * v6.4s := (0,0,pprk[0],pprk[1])
    389 	 * v7.4s := (0,0,0,pprk[0])
    390 	 */
    391 	ext	v5.16b, v0.16b, v1.16b, #12
    392 	ext	v6.16b, v0.16b, v1.16b, #8
    393 	ext	v7.16b, v0.16b, v1.16b, #4
    394 
    395 	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
    396 	eor	v1.16b, v1.16b, v3.16b
    397 	eor	v1.16b, v1.16b, v5.16b
    398 	eor	v1.16b, v1.16b, v6.16b
    399 	eor	v1.16b, v1.16b, v7.16b
    400 
    401 	subs	x2, x2, #2		/* count down two rounds */
    402 	b.eq	2f			/* stop if this is the last one */
    403 
    404 	/* q3 := ShiftRows(SubBytes(q1)) */
    405 	mov	v3.16b, v1.16b
    406 	aese	v3.16b, v0.16b
    407 
    408 	/* v3.4s[i] := SubBytes(rk[3]) */
    409 	tbl	v3.16b, {v3.16b}, v17.16b
    410 
    411 	/*
    412 	 * v5.4s := (0,prk[0],prk[1],prk[2])
    413 	 * v6.4s := (0,0,prk[0],prk[1])
    414 	 * v7.4s := (0,0,0,prk[0])
    415 	 */
    416 	ext	v5.16b, v0.16b, v2.16b, #12
    417 	ext	v6.16b, v0.16b, v2.16b, #8
    418 	ext	v7.16b, v0.16b, v2.16b, #4
    419 
    420 	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
    421 	eor	v2.16b, v2.16b, v3.16b
    422 	eor	v2.16b, v2.16b, v5.16b
    423 	eor	v2.16b, v2.16b, v6.16b
    424 	eor	v2.16b, v2.16b, v7.16b
    425 
    426 	stp	q1, q2, [x0], #0x20	/* store two round keys */
    427 	b	1b
    428 
    429 2:	str	q1, [x0]		/* store last round key */
    430 	ret
    431 END(aesarmv8_setenckey256)
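
/*
 * This is the standard AES-256 schedule: RotWord/SubWord every eighth
 * word (the first aese/tbl with unshiftrows_rotword_3 above) and a
 * plain SubWord four words later (the second aese/tbl with
 * unshiftrows_3).  Equivalent C sketch, reusing the hypothetical
 * helpers sketched after aesarmv8_setenckey128:
 *
 *	void
 *	expand256(uint32_t rk[60], const uint32_t key[8])
 *	{
 *		memcpy(rk, key, 32);
 *		for (unsigned i = 8; i < 60; i++) {
 *			uint32_t t = rk[i - 1];
 *
 *			if (i % 8 == 0)
 *				t = subword(rotword(t)) ^ rcon[i/8 - 1];
 *			else if (i % 8 == 4)
 *				t = subword(t);
 *			rk[i] = rk[i - 8] ^ t;
 *		}
 *	}
 */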
    432 
    433 /*
    434  * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
    435  *     uint32_t nrounds@x2)
    436  *
    437  *	Convert AES encryption round keys to AES decryption round keys.
     438  *	`nrounds' must be between 10 and 14.
    439  *
    440  *	Standard ABI calling convention.
    441  */
    442 ENTRY(aesarmv8_enctodec)
    443 	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
    444 	b	2f
    445 	_ALIGN_TEXT
    446 1:	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
    447 2:	str	q0, [x1], #0x10	/* store round key */
    448 	subs	x2, x2, #1	/* count down round */
    449 	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
    450 	b.ne	1b		/* repeat if there's more */
    451 	str	q0, [x1]	/* store first round key verbatim */
    452 	ret
    453 END(aesarmv8_enctodec)
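
/*
 * This is the `equivalent inverse cipher' key conversion: the
 * decryption round keys are the encryption round keys in reverse
 * order, with InvMixColumns (AESIMC) applied to every key except the
 * first and last.  Illustrative C sketch; invmixcolumns() is a
 * hypothetical helper over one 16-byte block, not defined here:
 *
 *	void
 *	enctodec(uint8_t dec[][16], const uint8_t enc[][16],
 *	    unsigned nrounds)
 *	{
 *		for (unsigned i = 0; i <= nrounds; i++) {
 *			memcpy(dec[i], enc[nrounds - i], 16);
 *			if (i != 0 && i != nrounds)
 *				invmixcolumns(dec[i]);
 *		}
 *	}
 */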
    454 
    455 /*
    456  * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
    457  *     uint8_t out[16] @x2, uint32_t nrounds@x3)
    458  *
    459  *	Encrypt a single block.
    460  *
    461  *	Standard ABI calling convention.
    462  */
    463 ENTRY(aesarmv8_enc)
    464 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    465 	mov	fp, sp
    466 	ldr	q0, [x1]	/* q0 := ptxt */
    467 	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
    468 	str	q0, [x2]	/* store ctxt */
    469 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    470 	ret
    471 END(aesarmv8_enc)
    472 
    473 /*
    474  * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
    475  *     uint8_t out[16] @x2, uint32_t nrounds@x3)
    476  *
    477  *	Decrypt a single block.
    478  *
    479  *	Standard ABI calling convention.
    480  */
    481 ENTRY(aesarmv8_dec)
    482 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    483 	mov	fp, sp
    484 	ldr	q0, [x1]	/* q0 := ctxt */
    485 	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
    486 	str	q0, [x2]	/* store ptxt */
    487 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    488 	ret
    489 END(aesarmv8_dec)
    490 
    491 /*
    492  * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    493  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
    494  *     uint32_t nrounds@x5)
    495  *
    496  *	Encrypt a contiguous sequence of blocks with AES-CBC.
    497  *
    498  *	nbytes must be an integral multiple of 16.
    499  *
    500  *	Standard ABI calling convention.
    501  */
    502 ENTRY(aesarmv8_cbc_enc)
    503 	cbz	x3, 2f			/* stop if nothing to do */
    504 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    505 	mov	fp, sp
    506 	mov	x9, x0			/* x9 := enckey */
    507 	mov	x10, x3			/* x10 := nbytes */
    508 	ldr	q0, [x4]		/* q0 := chaining value */
    509 	_ALIGN_TEXT
    510 1:	ldr	q1, [x1], #0x10		/* q1 := plaintext block */
    511 	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
    512 	mov	x0, x9			/* x0 := enckey */
    513 	mov	x3, x5			/* x3 := nrounds */
    514 	bl	aesarmv8_enc1		/* q0 := ctxt; trash x0/x3/q16 */
    515 	subs	x10, x10, #0x10		/* count down nbytes */
    516 	str	q0, [x2], #0x10		/* store ciphertext block */
    517 	b.ne	1b			/* repeat if x10 is nonzero */
    518 	str	q0, [x4]		/* store chaining value */
    519 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    520 2:	ret
    521 END(aesarmv8_cbc_enc)
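
/*
 * The loop above is plain CBC encryption: each plaintext block is
 * XORed with the previous ciphertext block (the IV for the first
 * block) before being encrypted, so the blocks are necessarily
 * processed serially.  Illustrative C sketch; aes_enc() is a
 * hypothetical one-block helper, not a routine in this file:
 *
 *	void
 *	cbc_enc(const struct aesenc *k, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t iv[16], uint32_t nrounds)
 *	{
 *		uint8_t cv[16];
 *
 *		memcpy(cv, iv, 16);
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (unsigned i = 0; i < 16; i++)
 *				cv[i] ^= in[i];
 *			aes_enc(k, cv, cv, nrounds);	// cv := E_k(cv ^ pt)
 *			memcpy(out, cv, 16);
 *		}
 *		memcpy(iv, cv, 16);
 *	}
 */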
    522 
    523 /*
    524  * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    525  *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
    526  *     uint32_t nrounds@x5)
    527  *
    528  *	Decrypt a contiguous sequence of blocks with AES-CBC.
    529  *
    530  *	nbytes must be a positive integral multiple of 16.  This routine
    531  *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
    532  *
    533  *	Standard ABI calling convention.
    534  */
    535 ENTRY(aesarmv8_cbc_dec1)
    536 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    537 	mov	fp, sp
    538 	ldr	q24, [x4]		/* q24 := iv */
     539 	mov	x9, x0			/* x9 := deckey */
    540 	mov	x10, x3			/* x10 := nbytes */
    541 	add	x1, x1, x3		/* x1 := pointer past end of in */
    542 	add	x2, x2, x3		/* x2 := pointer past end of out */
    543 	ldr	q0, [x1, #-0x10]!	/* q0 := last ciphertext block */
    544 	str	q0, [x4]		/* update iv */
    545 	b	2f
    546 	_ALIGN_TEXT
    547 1:	ldr	q31, [x1, #-0x10]!	/* q31 := chaining value */
    548 	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
    549 	str	q0, [x2, #-0x10]!	/* store plaintext block */
    550 	mov	v0.16b, v31.16b		/* move cv = ciphertext block */
     551 2:	mov	x0, x9			/* x0 := deckey */
    552 	mov	x3, x5			/* x3 := nrounds */
    553 	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3/q16 */
    554 	subs	x10, x10, #0x10		/* count down nbytes */
    555 	b.ne	1b			/* repeat if more blocks */
    556 	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
    557 	str	q0, [x2, #-0x10]!	/* store first plaintext block */
    558 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    559 	ret
    560 END(aesarmv8_cbc_dec1)
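
/*
 * CBC decryption is done back to front: the chaining value for block
 * i is just ciphertext block i-1, still sitting in the input buffer,
 * so nothing needs to be saved except the final ciphertext block,
 * which becomes the new iv.  Illustrative C sketch; aes_dec() is a
 * hypothetical one-block helper:
 *
 *	void
 *	cbc_dec(const struct aesdec *k, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t iv[16], uint32_t nrounds)
 *	{
 *		uint8_t lastct[16];
 *		size_t i = nbytes;
 *
 *		memcpy(lastct, in + nbytes - 16, 16);
 *		while (i) {
 *			i -= 16;
 *			aes_dec(k, in + i, out + i, nrounds);
 *			for (unsigned j = 0; j < 16; j++)
 *				out[i + j] ^= (i ? in[i - 16 + j] : iv[j]);
 *		}
 *		memcpy(iv, lastct, 16);
 *	}
 */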
    561 
    562 /*
    563  * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    564  *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
    565  *     uint32_t nrounds@x5)
    566  *
    567  *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
    568  *
    569  *	nbytes must be a positive integral multiple of 128.
    570  *
    571  *	Standard ABI calling convention.
    572  */
    573 ENTRY(aesarmv8_cbc_dec8)
    574 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    575 	mov	fp, sp
    576 	ldr	q24, [x4]		/* q24 := iv */
     577 	mov	x9, x0			/* x9 := deckey */
    578 	mov	x10, x3			/* x10 := nbytes */
    579 	add	x1, x1, x3		/* x1 := pointer past end of in */
    580 	add	x2, x2, x3		/* x2 := pointer past end of out */
    581 	ldp	q6, q7, [x1, #-0x20]!	/* q6, q7 := last ciphertext blocks */
    582 	str	q7, [x4]		/* update iv */
    583 	b	2f
    584 	_ALIGN_TEXT
    585 1:	ldp	q6, q7, [x1, #-0x20]!
    586 	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
    587 	stp	q0, q1, [x2, #-0x20]!
    588 2:	ldp	q4, q5, [x1, #-0x20]!
    589 	ldp	q2, q3, [x1, #-0x20]!
    590 	ldp	q0, q1, [x1, #-0x20]!
    591 	mov	v31.16b, v6.16b		/* q[24+i] := cv[i], 0<i<8 */
    592 	mov	v30.16b, v5.16b
    593 	mov	v29.16b, v4.16b
    594 	mov	v28.16b, v3.16b
    595 	mov	v27.16b, v2.16b
    596 	mov	v26.16b, v1.16b
    597 	mov	v25.16b, v0.16b
     598 	mov	x0, x9			/* x0 := deckey */
    599 	mov	x3, x5			/* x3 := nrounds */
    600 	bl	aesarmv8_dec8		/* q[i] := cv[i] ^ pt[i];
    601 					 * trash x0/x3/q16 */
    602 	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
    603 	eor	v6.16b, v6.16b, v30.16b
    604 	eor	v5.16b, v5.16b, v29.16b
    605 	eor	v4.16b, v4.16b, v28.16b
    606 	eor	v3.16b, v3.16b, v27.16b
    607 	eor	v2.16b, v2.16b, v26.16b
    608 	eor	v1.16b, v1.16b, v25.16b
    609 	subs	x10, x10, #0x80		/* count down nbytes */
    610 	stp	q6, q7, [x2, #-0x20]!	/* store plaintext blocks */
    611 	stp	q4, q5, [x2, #-0x20]!
    612 	stp	q2, q3, [x2, #-0x20]!
    613 	b.ne	1b			/* repeat if there's more */
    614 	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
    615 	stp	q0, q1, [x2, #-0x20]!	/* store first two plaintext blocks */
    616 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    617 	ret
    618 END(aesarmv8_cbc_dec8)
    619 
    620 /*
    621  * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    622  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    623  *     uint32_t nrounds@x5)
    624  *
    625  *	Encrypt a contiguous sequence of blocks with AES-XTS.
    626  *
    627  *	nbytes must be a positive integral multiple of 16.  This routine
    628  *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
    629  *
    630  *	Standard ABI calling convention.
    631  */
    632 ENTRY(aesarmv8_xts_enc1)
    633 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    634 	mov	fp, sp
    635 	mov	x9, x0			/* x9 := enckey */
    636 	mov	x10, x3			/* x10 := nbytes */
    637 	ldr	q31, [x4]		/* q31 := tweak */
    638 	_ALIGN_TEXT
    639 1:	ldr	q0, [x1], #0x10		/* q0 := ptxt */
    640 	mov	x0, x9			/* x0 := enckey */
    641 	mov	x3, x5			/* x3 := nrounds */
    642 	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
    643 	bl	aesarmv8_enc1		/* q0 := AES(...); trash x0/x3/q16 */
    644 	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
    645 	str	q0, [x2], #0x10		/* store ciphertext block */
    646 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    647 	subs	x10, x10, #0x10		/* count down nbytes */
    648 	b.ne	1b			/* repeat if more blocks */
    649 	str	q31, [x4]		/* update tweak */
    650 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    651 	ret
    652 END(aesarmv8_xts_enc1)
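
/*
 * Each block is whitened with the current tweak before and after the
 * block cipher, and the tweak is multiplied by x in GF(2^128) between
 * blocks.  Illustrative C sketch; aes_enc() and xts_mulx() are
 * hypothetical one-block helpers (xts_mulx() is sketched after the
 * xtscarry table below):
 *
 *	void
 *	xts_enc(const struct aesenc *k, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t tweak[16], uint32_t nrounds)
 *	{
 *		uint8_t b[16];
 *
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (unsigned i = 0; i < 16; i++)
 *				b[i] = in[i] ^ tweak[i];
 *			aes_enc(k, b, b, nrounds);
 *			for (unsigned i = 0; i < 16; i++)
 *				out[i] = b[i] ^ tweak[i];
 *			xts_mulx(tweak);	// tweak *= x
 *		}
 *	}
 *
 * Decryption is identical except that the block cipher runs in the
 * decryption direction; the tweak schedule is the same.
 */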
    653 
    654 /*
    655  * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    656  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    657  *     uint32_t nrounds@x5)
    658  *
    659  *	Encrypt a contiguous sequence of blocks with AES-XTS.
    660  *
    661  *	nbytes must be a positive integral multiple of 128.
    662  *
    663  *	Standard ABI calling convention.
    664  */
    665 ENTRY(aesarmv8_xts_enc8)
    666 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    667 	mov	fp, sp
    668 	mov	x9, x0			/* x9 := enckey */
    669 	mov	x10, x3			/* x10 := nbytes */
    670 	ldr	q31, [x4]		/* q31 := tweak */
    671 	_ALIGN_TEXT
    672 1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
    673 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    674 	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
    675 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    676 	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
    677 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    678 	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
    679 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    680 	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
    681 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    682 	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
    683 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    684 	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
    685 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    686 					/* q31 := tweak[7] */
    687 	ldp	q0, q1, [x1], #0x20	/* q[i] := ptxt[i] */
    688 	ldp	q2, q3, [x1], #0x20
    689 	ldp	q4, q5, [x1], #0x20
    690 	ldp	q6, q7, [x1], #0x20
    691 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
    692 	eor	v1.16b, v1.16b, v25.16b
    693 	eor	v2.16b, v2.16b, v26.16b
    694 	eor	v3.16b, v3.16b, v27.16b
    695 	eor	v4.16b, v4.16b, v28.16b
    696 	eor	v5.16b, v5.16b, v29.16b
    697 	eor	v6.16b, v6.16b, v30.16b
    698 	eor	v7.16b, v7.16b, v31.16b
    699 	mov	x0, x9			/* x0 := enckey */
    700 	mov	x3, x5			/* x3 := nrounds */
    701 	bl	aesarmv8_enc8		/* encrypt q0-q7; trash x0/x3/q16 */
    702 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
    703 	eor	v1.16b, v1.16b, v25.16b
    704 	eor	v2.16b, v2.16b, v26.16b
    705 	eor	v3.16b, v3.16b, v27.16b
    706 	eor	v4.16b, v4.16b, v28.16b
    707 	eor	v5.16b, v5.16b, v29.16b
    708 	eor	v6.16b, v6.16b, v30.16b
    709 	eor	v7.16b, v7.16b, v31.16b
    710 	stp	q0, q1, [x2], #0x20	/* store ciphertext blocks */
    711 	stp	q2, q3, [x2], #0x20
    712 	stp	q4, q5, [x2], #0x20
    713 	stp	q6, q7, [x2], #0x20
    714 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    715 	subs	x10, x10, #0x80		/* count down nbytes */
    716 	b.ne	1b			/* repeat if more block groups */
    717 	str	q31, [x4]		/* update tweak */
    718 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    719 	ret
    720 END(aesarmv8_xts_enc8)
    721 
    722 /*
    723  * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    724  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    725  *     uint32_t nrounds@x5)
    726  *
     727  *	Decrypt a contiguous sequence of blocks with AES-XTS.
    728  *
    729  *	nbytes must be a positive integral multiple of 16.  This routine
    730  *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
    731  *
    732  *	Standard ABI calling convention.
    733  */
    734 ENTRY(aesarmv8_xts_dec1)
    735 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    736 	mov	fp, sp
    737 	mov	x9, x0			/* x9 := deckey */
    738 	mov	x10, x3			/* x10 := nbytes */
    739 	ldr	q31, [x4]		/* q31 := tweak */
    740 	_ALIGN_TEXT
    741 1:	ldr	q0, [x1], #0x10		/* q0 := ctxt */
    742 	mov	x0, x9			/* x0 := deckey */
    743 	mov	x3, x5			/* x3 := nrounds */
    744 	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
    745 	bl	aesarmv8_dec1		/* q0 := AES(...); trash x0/x3/q16 */
    746 	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
    747 	str	q0, [x2], #0x10		/* store plaintext block */
    748 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    749 	subs	x10, x10, #0x10		/* count down nbytes */
    750 	b.ne	1b			/* repeat if more blocks */
    751 	str	q31, [x4]		/* update tweak */
    752 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    753 	ret
    754 END(aesarmv8_xts_dec1)
    755 
    756 /*
    757  * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    758  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    759  *     uint32_t nrounds@x5)
    760  *
     761  *	Decrypt a contiguous sequence of blocks with AES-XTS.
    762  *
    763  *	nbytes must be a positive integral multiple of 128.
    764  *
    765  *	Standard ABI calling convention.
    766  */
    767 ENTRY(aesarmv8_xts_dec8)
    768 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    769 	mov	fp, sp
    770 	mov	x9, x0			/* x9 := deckey */
    771 	mov	x10, x3			/* x10 := nbytes */
    772 	ldr	q31, [x4]		/* q31 := tweak */
    773 	_ALIGN_TEXT
    774 1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
    775 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    776 	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
    777 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    778 	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
    779 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    780 	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
    781 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    782 	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
    783 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    784 	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
    785 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    786 	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
    787 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    788 					/* q31 := tweak[7] */
    789 	ldp	q0, q1, [x1], #0x20	/* q[i] := ctxt[i] */
    790 	ldp	q2, q3, [x1], #0x20
    791 	ldp	q4, q5, [x1], #0x20
    792 	ldp	q6, q7, [x1], #0x20
    793 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
    794 	eor	v1.16b, v1.16b, v25.16b
    795 	eor	v2.16b, v2.16b, v26.16b
    796 	eor	v3.16b, v3.16b, v27.16b
    797 	eor	v4.16b, v4.16b, v28.16b
    798 	eor	v5.16b, v5.16b, v29.16b
    799 	eor	v6.16b, v6.16b, v30.16b
    800 	eor	v7.16b, v7.16b, v31.16b
    801 	mov	x0, x9			/* x0 := deckey */
    802 	mov	x3, x5			/* x3 := nrounds */
    803 	bl	aesarmv8_dec8		/* decrypt q0-q7; trash x0/x3/q16 */
    804 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
    805 	eor	v1.16b, v1.16b, v25.16b
    806 	eor	v2.16b, v2.16b, v26.16b
    807 	eor	v3.16b, v3.16b, v27.16b
    808 	eor	v4.16b, v4.16b, v28.16b
    809 	eor	v5.16b, v5.16b, v29.16b
    810 	eor	v6.16b, v6.16b, v30.16b
    811 	eor	v7.16b, v7.16b, v31.16b
    812 	stp	q0, q1, [x2], #0x20	/* store plaintext blocks */
    813 	stp	q2, q3, [x2], #0x20
    814 	stp	q4, q5, [x2], #0x20
    815 	stp	q6, q7, [x2], #0x20
    816 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    817 	subs	x10, x10, #0x80		/* count down nbytes */
    818 	b.ne	1b			/* repeat if more block groups */
    819 	str	q31, [x4]		/* update tweak */
    820 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    821 	ret
    822 END(aesarmv8_xts_dec8)
    823 
    824 /*
    825  * aesarmv8_xts_mulx(tweak@q31)
    826  *
    827  *	Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
    828  *	Uses x0 and q0/q1 as temporaries.
    829  */
    830 	.text
    831 	_ALIGN_TEXT
    832 	.type	aesarmv8_xts_mulx,@function
    833 aesarmv8_xts_mulx:
    834 	/*
    835 	 * Simultaneously determine
    836 	 * (a) whether the high bit of the low half must be
    837 	 *     shifted into the low bit of the high half, and
    838 	 * (b) whether the high bit of the high half must be
    839 	 *     carried into x^128 = x^7 + x^2 + x + 1.
    840 	 */
    841 	adrl	x0, xtscarry
    842 	cmlt	v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
    843 	ldr	q0, [x0]		/* q0 := xtscarry */
    844 	ext	v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
    845 	shl	v31.2d, v31.2d, #1	/* shift */
    846 	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
    847 	eor	v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */
    848 	ret
    849 END(aesarmv8_xts_mulx)
    850 
    851 	.section .rodata
    852 	.p2align 4
    853 	.type	xtscarry,@object
    854 xtscarry:
    855 	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
    856 END(xtscarry)
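
/*
 * In the XTS convention the tweak is a 128-bit polynomial stored
 * little-endian, so multiplying by x is a 128-bit left shift with the
 * carry out of bit 127 folded back in as x^7 + x^2 + x + 1 (0x87).
 * The SIMD code above does this two 64-bit lanes at a time: CMLT/EXT
 * route each lane's carry-out to the other lane, and the masked
 * xtscarry constant supplies the cross-lane carry bit and the 0x87
 * reduction in a single EOR.  Equivalent byte-at-a-time C sketch
 * (illustrative only):
 *
 *	static void
 *	xts_mulx(uint8_t tweak[16])
 *	{
 *		unsigned carry = 0;
 *
 *		for (unsigned i = 0; i < 16; i++) {
 *			unsigned t = ((unsigned)tweak[i] << 1) | carry;
 *
 *			carry = t >> 8;
 *			tweak[i] = t & 0xff;
 *		}
 *		if (carry)
 *			tweak[0] ^= 0x87;
 *	}
 */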
    857 
    858 /*
    859  * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
    860  *
    861  *	Update an AES-XTS tweak.
    862  *
    863  *	Standard ABI calling convention.
    864  */
    865 ENTRY(aesarmv8_xts_update)
    866 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    867 	mov	fp, sp
    868 	ldr	q31, [x0]		/* load tweak */
    869 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    870 	str	q31, [x1]		/* store tweak */
    871 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    872 	ret
    873 END(aesarmv8_xts_update)
    874 
    875 /*
    876  * aesarmv8_cbcmac_update1(const struct aesenc *enckey@x0,
    877  *     const uint8_t *in@x1, size_t nbytes@x2, uint8_t auth[16] @x3,
    878  *     uint32_t nrounds@x4)
    879  *
    880  *	Update CBC-MAC.
    881  *
    882  *	nbytes must be a positive integral multiple of 16.
    883  *
    884  *	Standard ABI calling convention.
    885  */
    886 ENTRY(aesarmv8_cbcmac_update1)
    887 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    888 	mov	fp, sp
    889 	ldr	q0, [x3]		/* q0 := initial authenticator */
    890 	mov	x9, x0			/* x9 := enckey */
    891 	mov	x5, x3			/* x5 := &auth (enc1 trashes x3) */
    892 	_ALIGN_TEXT
    893 1:	ldr	q1, [x1], #0x10		/* q1 := plaintext block */
    894 	mov	x0, x9			/* x0 := enckey */
    895 	mov	x3, x4			/* x3 := nrounds */
    896 	eor	v0.16b, v0.16b, v1.16b	/* q0 := auth ^ ptxt */
    897 	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */
    898 	subs	x2, x2, #0x10		/* count down nbytes */
     899 	b.ne	1b			/* repeat if x2 is nonzero */
    900 	str	q0, [x5]		/* store updated authenticator */
    901 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    902 	ret
    903 END(aesarmv8_cbcmac_update1)
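
/*
 * CBC-MAC is CBC encryption that keeps only the running block:
 * auth := E_k(auth ^ pt[i]) for each 16-byte block.  Illustrative C
 * sketch; aes_enc() is a hypothetical one-block helper:
 *
 *	void
 *	cbcmac_update(const struct aesenc *k, const uint8_t *in,
 *	    size_t nbytes, uint8_t auth[16], uint32_t nrounds)
 *	{
 *		for (; nbytes; nbytes -= 16, in += 16) {
 *			for (unsigned i = 0; i < 16; i++)
 *				auth[i] ^= in[i];
 *			aes_enc(k, auth, auth, nrounds);
 *		}
 *	}
 */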
    904 
    905 /*
    906  * aesarmv8_ccm_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    907  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
    908  *     uint32_t nrounds@x5)
    909  *
    910  *	Update CCM encryption.
    911  *
    912  *	nbytes must be a positive integral multiple of 16.
    913  *
    914  *	Standard ABI calling convention.
    915  */
    916 ENTRY(aesarmv8_ccm_enc1)
    917 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    918 	mov	fp, sp
    919 	ldp	q0, q2, [x4]		/* q0 := auth, q2 := ctr (be) */
    920 	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
    921 	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
    922 	mov	x9, x0			/* x9 := enckey */
    923 	mov	x10, x3			/* x10 := nbytes */
    924 #if _BYTE_ORDER == _LITTLE_ENDIAN
    925 	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */
    926 #endif
    927 	_ALIGN_TEXT
    928 1:	ldr	q3, [x1], #0x10		/* q3 := plaintext block */
    929 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
    930 	mov	x0, x9			/* x0 := enckey */
    931 	mov	x3, x5			/* x3 := nrounds */
    932 #if _BYTE_ORDER == _LITTLE_ENDIAN
    933 	rev32	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
    934 #else
    935 	mov	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
    936 #endif
    937 	eor	v0.16b, v0.16b, v3.16b	/* q0 := auth ^ ptxt */
    938 	bl	aesarmv8_enc2		/* q0 := auth', q1 := pad;
    939 					 * trash x0/x3/q16 */
    940 	eor	v3.16b, v1.16b, v3.16b	/* q3 := ciphertext block */
    941 	subs	x10, x10, #0x10		/* count down bytes */
    942 	str	q3, [x2], #0x10		/* store ciphertext block */
    943 	b.ne	1b			/* repeat if more blocks */
    944 #if _BYTE_ORDER == _LITTLE_ENDIAN
    945 	rev32	v2.16b, v2.16b		/* q2 := ctr (big-endian) */
    946 #endif
    947 	stp	q0, q2, [x4]		/* store updated auth/ctr */
    948 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    949 	ret
    950 END(aesarmv8_ccm_enc1)
    951 
    952 /*
    953  * aesarmv8_ccm_dec1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    954  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
    955  *     uint32_t nrounds@x5)
    956  *
    957  *	Update CCM decryption.
    958  *
    959  *	nbytes must be a positive integral multiple of 16.
    960  *
    961  *	Standard ABI calling convention.
    962  */
    963 ENTRY(aesarmv8_ccm_dec1)
    964 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    965 	mov	fp, sp
    966 	ldp	q1, q2, [x4]		/* q1 := auth, q2 := ctr (be) */
    967 	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
    968 	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
    969 	mov	x9, x0			/* x9 := enckey */
    970 	mov	x10, x3			/* x10 := nbytes */
    971 #if _BYTE_ORDER == _LITTLE_ENDIAN
    972 	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */
    973 #endif
    974 
    975 	/* Decrypt the first block.  */
    976 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
    977 	mov	x3, x5			/* x3 := nrounds */
    978 #if _BYTE_ORDER == _LITTLE_ENDIAN
    979 	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
    980 #else
    981 	mov	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
    982 #endif
    983 	ldr	q3, [x1], #0x10		/* q3 := ctxt */
    984 	bl	aesarmv8_enc1		/* q0 := pad; trash x0/x3/q16 */
    985 	b	2f
    986 
    987 	_ALIGN_TEXT
    988 1:	/*
    989 	 * Authenticate the last block and decrypt the next block
    990 	 * simultaneously.
    991 	 *
    992 	 *	q1 = auth ^ ptxt[-1]
     993 	 *	q2 = ctr[-1] (host-endian)
    994 	 */
    995 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
    996 	mov	x0, x9			/* x0 := enckey */
    997 	mov	x3, x5			/* x3 := nrounds */
    998 #if _BYTE_ORDER == _LITTLE_ENDIAN
    999 	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
   1000 #else
   1001 	mov	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
   1002 #endif
   1003 	ldr	q3, [x1], #0x10		/* q3 := ctxt */
   1004 	bl	aesarmv8_enc2		/* q0 := pad, q1 := auth';
   1005 					 * trash x0/x3/q16 */
   1006 2:	eor	v3.16b, v0.16b, v3.16b	/* q3 := plaintext block */
   1007 	subs	x10, x10, #0x10
   1008 	str	q3, [x2], #0x10		/* store plaintext */
   1009 	eor	v1.16b, v1.16b, v3.16b	/* q1 := auth ^ ptxt */
   1010 	b.ne	1b
   1011 
   1012 #if _BYTE_ORDER == _LITTLE_ENDIAN
   1013 	rev32	v2.16b, v2.16b		/* q2 := ctr (big-endian) */
   1014 #endif
   1015 
   1016 	/* Authenticate the last block.  */
   1017 	mov	x0, x9			/* x0 := enckey */
   1018 	mov	x3, x5			/* x3 := nrounds */
   1019 	mov	v0.16b, v1.16b		/* q0 := auth ^ ptxt */
   1020 	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */
   1021 	stp	q0, q2, [x4]		/* store updated auth/ctr */
   1022 	ldp	fp, lr, [sp], #16	/* pop stack frame */
   1023 	ret
   1024 END(aesarmv8_ccm_dec1)
   1025 
   1026 	.section .rodata
   1027 	.p2align 4
   1028 	.type	ctr32_inc,@object
   1029 ctr32_inc:
   1030 	.int	0, 0, 0, 1
   1031 END(ctr32_inc)
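
/*
 * Both CCM routines above interleave a 32-bit big-endian counter-mode
 * stream with a CBC-MAC over the plaintext; per block that is two AES
 * invocations, which the assembly issues together via aesarmv8_enc2.
 * Illustrative C sketch of the encryption direction; aes_enc() is a
 * hypothetical one-block helper and be32enc()/be32dec() are the usual
 * <sys/endian.h> accessors:
 *
 *	void
 *	ccm_enc(const struct aesenc *k, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t authctr[32], uint32_t nrounds)
 *	{
 *		uint8_t *auth = authctr, *ctr = authctr + 16, pad[16];
 *
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			be32enc(ctr + 12, be32dec(ctr + 12) + 1);
 *			aes_enc(k, ctr, pad, nrounds);	// CTR pad
 *			for (unsigned i = 0; i < 16; i++) {
 *				auth[i] ^= in[i];	// CBC-MAC over ptxt
 *				out[i] = in[i] ^ pad[i];
 *			}
 *			aes_enc(k, auth, auth, nrounds);
 *		}
 *	}
 */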
   1032 
   1033 /*
   1034  * aesarmv8_enc1(const struct aesenc *enckey@x0,
   1035  *     uint128_t block@q0, uint32_t nrounds@x3)
   1036  *
   1037  *	Encrypt a single AES block in q0.
   1038  *
   1039  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1040  */
   1041 	.text
   1042 	_ALIGN_TEXT
   1043 	.type	aesarmv8_enc1,@function
   1044 aesarmv8_enc1:
   1045 	ldr	q16, [x0], #0x10	/* load round key */
   1046 	sub	x3, x3, #1
   1047 	_ALIGN_TEXT
   1048 1:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0)))) */
   1049 	aese	v0.16b, v16.16b
   1050 	aesmc	v0.16b, v0.16b
   1051 	ldr	q16, [x0], #0x10
   1052 	subs	x3, x3, #1
   1053 	b.ne	1b
   1054 	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
   1055 	aese	v0.16b, v16.16b
   1056 	ldr	q16, [x0]		/* load last round key */
   1057 	/* q0 := AddRoundKey_q16(q0) */
   1058 	eor	v0.16b, v0.16b, v16.16b
   1059 	ret
   1060 END(aesarmv8_enc1)
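
/*
 * AESE folds AddRoundKey in front of SubBytes/ShiftRows, i.e.
 * AESE(q, rk) computes ShiftRows(SubBytes(q ^ rk)), so the loop above
 * consumes round keys 0 through nrounds-1 inside AESE/AESMC and the
 * final round key is applied with a plain EOR.  Schematic C sketch of
 * the same structure; xor16(), subbytes(), shiftrows(), and
 * mixcolumns() are hypothetical one-block helpers:
 *
 *	void
 *	aes_enc_block(uint8_t q[16], const uint8_t (*rk)[16],
 *	    uint32_t nrounds)
 *	{
 *		for (uint32_t i = 0; i < nrounds - 1; i++) {
 *			xor16(q, rk[i]);
 *			subbytes(q);
 *			shiftrows(q);
 *			mixcolumns(q);
 *		}
 *		xor16(q, rk[nrounds - 1]);
 *		subbytes(q);
 *		shiftrows(q);
 *		xor16(q, rk[nrounds]);
 *	}
 *
 * The decryption routines below mirror this with AESD/AESIMC and the
 * inverse transformations.
 */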
   1061 
   1062 /*
   1063  * aesarmv8_enc2(const struct aesenc *enckey@x0,
   1064  *     uint128_t block@q0, uint128_t block@q1, uint32_t nrounds@x3)
   1065  *
   1066  *	Encrypt two AES blocks in q0 and q1.
   1067  *
   1068  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1069  */
   1070 	.text
   1071 	_ALIGN_TEXT
   1072 	.type	aesarmv8_enc2,@function
   1073 aesarmv8_enc2:
   1074 	ldr	q16, [x0], #0x10	/* load round key */
   1075 	sub	x3, x3, #1
   1076 	_ALIGN_TEXT
   1077 1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
   1078 	aese	v0.16b, v16.16b
   1079 	aesmc	v0.16b, v0.16b
   1080 	aese	v1.16b, v16.16b
   1081 	aesmc	v1.16b, v1.16b
   1082 	ldr	q16, [x0], #0x10	/* load next round key */
   1083 	subs	x3, x3, #1
   1084 	b.ne	1b
   1085 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
   1086 	aese	v0.16b, v16.16b
   1087 	aese	v1.16b, v16.16b
   1088 	ldr	q16, [x0]		/* load last round key */
   1089 	/* q[i] := AddRoundKey_q16(q[i]) */
   1090 	eor	v0.16b, v0.16b, v16.16b
   1091 	eor	v1.16b, v1.16b, v16.16b
   1092 	ret
   1093 END(aesarmv8_enc2)
   1094 
   1095 /*
   1096  * aesarmv8_enc8(const struct aesenc *enckey@x0,
   1097  *     uint128_t block0@q0, ..., uint128_t block7@q7,
   1098  *     uint32_t nrounds@x3)
   1099  *
   1100  *	Encrypt eight AES blocks in q0 through q7 in parallel.
   1101  *
   1102  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1103  */
   1104 	.text
   1105 	_ALIGN_TEXT
   1106 	.type	aesarmv8_enc8,@function
   1107 aesarmv8_enc8:
   1108 	ldr	q16, [x0], #0x10	/* load round key */
   1109 	sub	x3, x3, #1
   1110 	_ALIGN_TEXT
   1111 1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
   1112 	aese	v0.16b, v16.16b
   1113 	aesmc	v0.16b, v0.16b
   1114 	aese	v1.16b, v16.16b
   1115 	aesmc	v1.16b, v1.16b
   1116 	aese	v2.16b, v16.16b
   1117 	aesmc	v2.16b, v2.16b
   1118 	aese	v3.16b, v16.16b
   1119 	aesmc	v3.16b, v3.16b
   1120 	aese	v4.16b, v16.16b
   1121 	aesmc	v4.16b, v4.16b
   1122 	aese	v5.16b, v16.16b
   1123 	aesmc	v5.16b, v5.16b
   1124 	aese	v6.16b, v16.16b
   1125 	aesmc	v6.16b, v6.16b
   1126 	aese	v7.16b, v16.16b
   1127 	aesmc	v7.16b, v7.16b
   1128 	ldr	q16, [x0], #0x10	/* load next round key */
   1129 	subs	x3, x3, #1
   1130 	b.ne	1b
   1131 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
   1132 	aese	v0.16b, v16.16b
   1133 	aese	v1.16b, v16.16b
   1134 	aese	v2.16b, v16.16b
   1135 	aese	v3.16b, v16.16b
   1136 	aese	v4.16b, v16.16b
   1137 	aese	v5.16b, v16.16b
   1138 	aese	v6.16b, v16.16b
   1139 	aese	v7.16b, v16.16b
   1140 	ldr	q16, [x0]		/* load last round key */
   1141 	/* q[i] := AddRoundKey_q16(q[i]) */
   1142 	eor	v0.16b, v0.16b, v16.16b
   1143 	eor	v1.16b, v1.16b, v16.16b
   1144 	eor	v2.16b, v2.16b, v16.16b
   1145 	eor	v3.16b, v3.16b, v16.16b
   1146 	eor	v4.16b, v4.16b, v16.16b
   1147 	eor	v5.16b, v5.16b, v16.16b
   1148 	eor	v6.16b, v6.16b, v16.16b
   1149 	eor	v7.16b, v7.16b, v16.16b
   1150 	ret
   1151 END(aesarmv8_enc8)
   1152 
   1153 /*
   1154  * aesarmv8_dec1(const struct aesdec *deckey@x0,
   1155  *     uint128_t block@q0, uint32_t nrounds@x3)
   1156  *
   1157  *	Decrypt a single AES block in q0.
   1158  *
   1159  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1160  */
   1161 	.text
   1162 	_ALIGN_TEXT
   1163 	.type	aesarmv8_dec1,@function
   1164 aesarmv8_dec1:
   1165 	ldr	q16, [x0], #0x10	/* load round key */
   1166 	sub	x3, x3, #1
   1167 	_ALIGN_TEXT
   1168 1:	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
   1169 	aesd	v0.16b, v16.16b
   1170 	/* q0 := InMixColumns(q0) */
   1171 	aesimc	v0.16b, v0.16b
   1172 	ldr	q16, [x0], #0x10	/* load next round key */
   1173 	subs	x3, x3, #1
   1174 	b.ne	1b
   1175 	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
   1176 	aesd	v0.16b, v16.16b
   1177 	ldr	q16, [x0]		/* load last round key */
   1178 	/* q0 := AddRoundKey_q16(q0) */
   1179 	eor	v0.16b, v0.16b, v16.16b
   1180 	ret
   1181 END(aesarmv8_dec1)
   1182 
   1183 /*
   1184  * aesarmv8_dec8(const struct aesdec *deckey@x0,
   1185  *     uint128_t block0@q0, ..., uint128_t block7@q7,
   1186  *     uint32_t nrounds@x3)
   1187  *
   1188  *	Decrypt eight AES blocks in q0 through q7 in parallel.
   1189  *
   1190  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1191  */
   1192 	.text
   1193 	_ALIGN_TEXT
   1194 	.type	aesarmv8_dec8,@function
   1195 aesarmv8_dec8:
   1196 	ldr	q16, [x0], #0x10	/* load round key */
   1197 	sub	x3, x3, #1
   1198 	_ALIGN_TEXT
   1199 1:	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
   1200 	aesd	v0.16b, v16.16b
   1201 	/* q[i] := InMixColumns(q[i]) */
   1202 	aesimc	v0.16b, v0.16b
   1203 	aesd	v1.16b, v16.16b
   1204 	aesimc	v1.16b, v1.16b
   1205 	aesd	v2.16b, v16.16b
   1206 	aesimc	v2.16b, v2.16b
   1207 	aesd	v3.16b, v16.16b
   1208 	aesimc	v3.16b, v3.16b
   1209 	aesd	v4.16b, v16.16b
   1210 	aesimc	v4.16b, v4.16b
   1211 	aesd	v5.16b, v16.16b
   1212 	aesimc	v5.16b, v5.16b
   1213 	aesd	v6.16b, v16.16b
   1214 	aesimc	v6.16b, v6.16b
   1215 	aesd	v7.16b, v16.16b
   1216 	aesimc	v7.16b, v7.16b
   1217 	ldr	q16, [x0], #0x10	/* load next round key */
   1218 	subs	x3, x3, #1
   1219 	b.ne	1b
   1220 	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
   1221 	aesd	v0.16b, v16.16b
   1222 	aesd	v1.16b, v16.16b
   1223 	aesd	v2.16b, v16.16b
   1224 	aesd	v3.16b, v16.16b
   1225 	aesd	v4.16b, v16.16b
   1226 	aesd	v5.16b, v16.16b
   1227 	aesd	v6.16b, v16.16b
   1228 	aesd	v7.16b, v16.16b
   1229 	ldr	q16, [x0]		/* load last round key */
   1230 	/* q[i] := AddRoundKey_q16(q[i]) */
   1231 	eor	v0.16b, v0.16b, v16.16b
   1232 	eor	v1.16b, v1.16b, v16.16b
   1233 	eor	v2.16b, v2.16b, v16.16b
   1234 	eor	v3.16b, v3.16b, v16.16b
   1235 	eor	v4.16b, v4.16b, v16.16b
   1236 	eor	v5.16b, v5.16b, v16.16b
   1237 	eor	v6.16b, v6.16b, v16.16b
   1238 	eor	v7.16b, v7.16b, v16.16b
   1239 	ret
   1240 END(aesarmv8_dec8)
   1241