      1 /*	$NetBSD: aes_armv8_64.S,v 1.14 2020/09/08 23:57:43 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  * POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 #include <aarch64/asm.h>
     30 
     31 RCSID("$NetBSD: aes_armv8_64.S,v 1.14 2020/09/08 23:57:43 riastradh Exp $")
     32 
     33 	.arch_extension	aes
     34 
     35 /*
     36  * uint32_t rcon[10]
     37  *
     38  *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2).
      40  *	Such elements of GF(2^8) need only eight bits to be represented,
     40  *	but we store them in 4-byte units so we can copy one into all
     41  *	four 4-byte lanes of a vector register with a single LD1R.  The
     42  *	access pattern is fixed, so indices into this table are never
     43  *	secret.
     44  */
     45 	.section .rodata
     46 	.p2align 2
     47 	.type	rcon,@object
     48 rcon:
     49 	.long	0x01
     50 	.long	0x02
     51 	.long	0x04
     52 	.long	0x08
     53 	.long	0x10
     54 	.long	0x20
     55 	.long	0x40
     56 	.long	0x80
     57 	.long	0x1b
     58 	.long	0x36
     59 END(rcon)
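
/*
 * Illustrative note (not assembled): the values above are just successive
 * powers of x in GF(2^8) reduced by the AES polynomial, so the table can be
 * regenerated with a portable C sketch along these lines:
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		uint8_t rc = 0x01;
 *
 *		for (int n = 0; n < 10; n++) {
 *			printf("\t.long\t0x%02x\n", rc);
 *			// multiply by x modulo x^8 + x^4 + x^3 + x + 1
 *			rc = (rc << 1) ^ ((rc & 0x80) ? 0x1b : 0);
 *		}
 *		return 0;
 *	}
 */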
     60 
     61 /*
     62  * uint128_t unshiftrows_rotword_1
     63  *
     64  *	Table for TBL instruction to undo ShiftRows, and then do
     65  *	RotWord on word 1, and then copy it into all the other words.
     66  */
     67 	.section .rodata
     68 	.p2align 4
     69 	.type	unshiftrows_rotword_1,@object
     70 unshiftrows_rotword_1:
     71 	.byte	0x01,0x0e,0x0b,0x04
     72 	.byte	0x01,0x0e,0x0b,0x04
     73 	.byte	0x01,0x0e,0x0b,0x04
     74 	.byte	0x01,0x0e,0x0b,0x04
     75 END(unshiftrows_rotword_1)
     76 
     77 /*
     78  * uint128_t unshiftrows_3
     79  *
     80  *	Table for TBL instruction to undo ShiftRows, and then copy word
     81  *	3 into all the other words.
     82  */
     83 	.section .rodata
     84 	.p2align 4
     85 	.type	unshiftrows_3,@object
     86 unshiftrows_3:
     87 	.byte	0x0c,0x09,0x06,0x03
     88 	.byte	0x0c,0x09,0x06,0x03
     89 	.byte	0x0c,0x09,0x06,0x03
     90 	.byte	0x0c,0x09,0x06,0x03
     91 END(unshiftrows_3)
     92 
     93 /*
     94  * uint128_t unshiftrows_rotword_3
     95  *
     96  *	Table for TBL instruction to undo ShiftRows, and then do
     97  *	RotWord on word 3, and then copy it into all the other words.
     98  */
     99 	.section .rodata
    100 	.p2align 4
    101 	.type	unshiftrows_rotword_3,@object
    102 unshiftrows_rotword_3:
    103 	.byte	0x09,0x06,0x03,0x0c
    104 	.byte	0x09,0x06,0x03,0x0c
    105 	.byte	0x09,0x06,0x03,0x0c
    106 	.byte	0x09,0x06,0x03,0x0c
    107 END(unshiftrows_rotword_3)
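
/*
 * Sanity-check sketch (not assembled) for the three tables above.  AESE with
 * a zero round key computes ShiftRows(SubBytes(state)), which moves state
 * byte i = 4*col + row to index row + 4*((col - row) & 3).  Each TBL index
 * above is therefore `where ShiftRows put the byte we actually want'; e.g.
 * for unshiftrows_rotword_3 we want RotWord of word 3, i.e. original bytes
 * 13,14,15,12:
 *
 *	#include <stdio.h>
 *
 *	static int
 *	shiftrows_dst(int i)
 *	{
 *		int row = i & 3, col = i >> 2;
 *
 *		return row + 4*((col - row) & 3);
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		const int rotword3[4] = { 13, 14, 15, 12 };
 *
 *		for (int j = 0; j < 4; j++)	// prints 0x09,0x06,0x03,0x0c
 *			printf("0x%02x%c", shiftrows_dst(rotword3[j]),
 *			    j == 3 ? '\n' : ',');
 *		return 0;
 *	}
 *
 * Bytes (5,6,7,4) and (12,13,14,15) reproduce unshiftrows_rotword_1 and
 * unshiftrows_3 the same way.
 */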
    108 
    109 /*
    110  * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
    111  *
    112  *	Expand a 16-byte AES-128 key into 10 round keys.
    113  *
    114  *	Standard ABI calling convention.
    115  */
    116 ENTRY(aesarmv8_setenckey128)
    117 	ld1	{v1.16b}, [x1]	/* q1 := master key */
    118 
    119 	adrl	x4, unshiftrows_rotword_3
    120 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
    121 	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 table */
    122 
    123 	str	q1, [x0], #0x10	/* store master key as first round key */
    124 	mov	x2, #10		/* round count */
    125 	adrl	x3, rcon	/* round constant */
    126 
    127 1:	/*
    128 	 * q0 = 0
    129 	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
    130 	 * x0 = pointer to round key to compute
    131 	 * x2 = round count
    132 	 * x3 = rcon pointer
    133 	 */
    134 
    135 	/* q3 := ShiftRows(SubBytes(q1)) */
    136 	mov	v3.16b, v1.16b
    137 	aese	v3.16b, v0.16b
    138 
    139 	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
    140 	ld1r	{v4.4s}, [x3], #4
    141 	tbl	v3.16b, {v3.16b}, v16.16b
    142 	eor	v3.16b, v3.16b, v4.16b
    143 
    144 	/*
    145 	 * v5.4s := (0,prk[0],prk[1],prk[2])
    146 	 * v6.4s := (0,0,prk[0],prk[1])
    147 	 * v7.4s := (0,0,0,prk[0])
    148 	 */
    149 	ext	v5.16b, v0.16b, v1.16b, #12
    150 	ext	v6.16b, v0.16b, v1.16b, #8
    151 	ext	v7.16b, v0.16b, v1.16b, #4
    152 
    153 	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
    154 	eor	v1.16b, v1.16b, v3.16b
    155 	eor	v1.16b, v1.16b, v5.16b
    156 	eor	v1.16b, v1.16b, v6.16b
    157 	eor	v1.16b, v1.16b, v7.16b
    158 
    159 	subs	x2, x2, #1	/* count down rounds */
    160 	str	q1, [x0], #0x10	/* store round key */
    161 	b.ne	1b
    162 
    163 	ret
    164 END(aesarmv8_setenckey128)
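
/*
 * For reference, the recurrence computed above is the standard FIPS-197 key
 * expansion with Nk = 4.  A portable C sketch (illustrative only; SubWord is
 * a hypothetical helper applying the AES S-box to each byte, and words are
 * little-endian 32-bit lanes as in the code above):
 *
 *	#include <stdint.h>
 *
 *	extern uint32_t SubWord(uint32_t);	// hypothetical S-box helper
 *	#define	RotWord(w)	(((w) >> 8) | ((w) << 24))
 *
 *	static const uint32_t rcon[10] = {
 *		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
 *	};
 *
 *	void
 *	aes128_expand_sketch(uint32_t rk[44], const uint32_t key[4])
 *	{
 *		for (int i = 0; i < 4; i++)
 *			rk[i] = key[i];
 *		for (int i = 4; i < 44; i++) {
 *			uint32_t t = rk[i - 1];
 *
 *			if (i % 4 == 0)
 *				t = SubWord(RotWord(t)) ^ rcon[i/4 - 1];
 *			rk[i] = rk[i - 4] ^ t;
 *		}
 *	}
 *
 * The 192- and 256-bit routines below implement the same recurrence with
 * Nk = 6 and Nk = 8, unrolled so each loop iteration produces whole 16-byte
 * round keys.
 */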
    165 
    166 /*
    167  * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
    168  *
    169  *	Expand a 24-byte AES-192 key into 12 round keys.
    170  *
    171  *	Standard ABI calling convention.
    172  */
    173 ENTRY(aesarmv8_setenckey192)
    174 	ld1	{v1.16b}, [x1], #0x10	/* q1 := master key[0:128) */
    175 	ld1	{v2.8b}, [x1]	/* d2 := master key[128:192) */
    176 
    177 	adrl	x4, unshiftrows_rotword_1
    178 	adrl	x5, unshiftrows_rotword_3
    179 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
    180 	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_1 */
    181 	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_rotword_3 */
    182 
    183 	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
    184 	mov	x2, #12		/* round count */
    185 	adrl	x3, rcon	/* round constant */
    186 
    187 1:	/*
    188 	 * q0 = 0
    189 	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
    190 	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
    191 	 * x0 = pointer to three round keys to compute
    192 	 * x2 = round count
    193 	 * x3 = rcon pointer
    194 	 */
    195 
    196 	/* q3 := ShiftRows(SubBytes(q2)) */
    197 	mov	v3.16b, v2.16b
    198 	aese	v3.16b, v0.16b
    199 
    200 	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
    201 	ld1r	{v4.4s}, [x3], #4
    202 	tbl	v3.16b, {v3.16b}, v16.16b
    203 	eor	v3.16b, v3.16b, v4.16b
    204 
    205 	/*
    206 	 * We need to compute:
    207 	 *
    208 	 * rk[0] := rklo[0]
    209 	 * rk[1] := rklo[1]
    210 	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
    211 	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
    212 	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
    213 	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
    214 	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
    215 	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
    216 	 *     ^ rklo[1]
    217 	 */
    218 
    219 	/*
    220 	 * v5.4s := (0,prk[0],prk[1],prk[2])
    221 	 * v6.4s := (0,0,prk[0],prk[1])
    222 	 * v7.4s := (0,0,0,prk[0])
    223 	 */
    224 	ext	v5.16b, v0.16b, v1.16b, #12
    225 	ext	v6.16b, v0.16b, v1.16b, #8
    226 	ext	v7.16b, v0.16b, v1.16b, #4
    227 
    228 	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
    229 	eor	v5.16b, v5.16b, v1.16b
    230 	eor	v5.16b, v5.16b, v3.16b
    231 	eor	v5.16b, v5.16b, v6.16b
    232 	eor	v5.16b, v5.16b, v7.16b
    233 
    234 	/*
    235 	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
    236 	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
    237 	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
    238 	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
    239 	 * (rklo[0],rklo[1],...).
    240 	 */
    241 
    242 	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
    243 	dup	v1.4s, v5.s[3]
    244 	mov	v1.s[0], v5.s[2]
    245 
    246 	/*
    247 	 * v6.4s := (0, 0, rklo[0], rklo[1])
    248 	 * v7.4s := (0, 0, 0, rklo[0])
    249 	 */
    250 	ext	v6.16b, v0.16b, v2.16b, #8
    251 	ext	v7.16b, v0.16b, v2.16b, #4
    252 
    253 	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
    254 	eor	v3.16b, v1.16b, v6.16b
    255 	eor	v3.16b, v3.16b, v7.16b
    256 
    257 	/*
    258 	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
    259 	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
    260 	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
    261 	 */
    262 	mov	v2.d[1], v5.d[0]
    263 
    264 	/* store two round keys */
    265 	stp	q2, q3, [x0], #0x20
    266 
    267 	/*
    268 	 * Live vector registers at this point:
    269 	 *
    270 	 *	q0 = zero
    271 	 *	q2 = rk
    272 	 *	q3 = nrk
    273 	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
    274 	 *	q16 = unshiftrows_rotword_1
    275 	 *	q17 = unshiftrows_rotword_3
    276 	 *
    277 	 * We have to compute, in q1:
    278 	 *
    279 	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
    280 	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
    281 	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
    282 	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
    283 	 *     ^ nrk[1]
    284 	 *
    285 	 * And, if there's any more afterward, in q2:
    286 	 *
    287 	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
    288 	 *     ^ nrk[1] ^ nrk[2]
    289 	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
    290 	 *     ^ nrk[1] ^ nrk[2] ^ nrk[3]
    291 	 */
    292 
     293 	/* q1 := ShiftRows(SubBytes(q3)) */
    294 	mov	v1.16b, v3.16b
    295 	aese	v1.16b, v0.16b
    296 
    297 	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
    298 	ld1r	{v4.4s}, [x3], #4
    299 	tbl	v1.16b, {v1.16b}, v17.16b
    300 	eor	v1.16b, v1.16b, v4.16b
    301 
    302 	/*
    303 	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already]
    304 	 * v4.4s := (0, rk[2], rk[3], nrk[0])
    305 	 * v6.4s := (0, 0, rk[2], rk[3])
    306 	 * v7.4s := (0, 0, 0, rk[2])
    307 	 */
    308 	ext	v4.16b, v0.16b, v5.16b, #12
    309 	ext	v6.16b, v0.16b, v5.16b, #8
    310 	ext	v7.16b, v0.16b, v5.16b, #4
    311 
    312 	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
    313 	eor	v1.16b, v1.16b, v5.16b
    314 	eor	v1.16b, v1.16b, v4.16b
    315 	eor	v1.16b, v1.16b, v6.16b
    316 	eor	v1.16b, v1.16b, v7.16b
    317 
    318 	subs	x2, x2, #3	/* count down three rounds */
    319 	str	q1, [x0], #0x10	/* store third round key */
    320 	b.eq	2f
    321 
    322 	/*
    323 	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
    324 	 * v5.4s := (0, nrk[2], xxx, xxx)
    325 	 */
    326 	ext	v4.16b, v3.16b, v0.16b, #8
    327 	ext	v5.16b, v0.16b, v4.16b, #12
    328 
    329 	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
    330 	dup	v2.4s, v1.s[3]
    331 
    332 	/*
    333 	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
    334 	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
    335 	 *     xxx, xxx)
    336 	 */
    337 	eor	v2.16b, v2.16b, v4.16b
    338 	eor	v2.16b, v2.16b, v5.16b
    339 
    340 	b	1b
    341 
    342 2:	ret
    343 END(aesarmv8_setenckey192)
    344 
    345 /*
    346  * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
    347  *
    348  *	Expand a 32-byte AES-256 key into 14 round keys.
    349  *
    350  *	Standard ABI calling convention.
    351  */
    352 ENTRY(aesarmv8_setenckey256)
    353 	/* q1 := key[0:128), q2 := key[128:256) */
    354 	ld1	{v1.16b-v2.16b}, [x1], #0x20
    355 
    356 	adrl	x4, unshiftrows_rotword_3
    357 	adrl	x5, unshiftrows_3
    358 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
    359 	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 */
    360 	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_3 */
    361 
    362 	/* store master key as first two round keys */
    363 	stp	q1, q2, [x0], #0x20
    364 	mov	x2, #14		/* round count */
    365 	adrl	x3, rcon	/* round constant */
    366 
    367 1:	/*
    368 	 * q0 = 0
    369 	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
    370 	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
    371 	 * x2 = round count
    372 	 * x3 = rcon pointer
    373 	 */
    374 
    375 	/* q3 := ShiftRows(SubBytes(q2)) */
    376 	mov	v3.16b, v2.16b
    377 	aese	v3.16b, v0.16b
    378 
    379 	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
    380 	ld1r	{v4.4s}, [x3], #4
    381 	tbl	v3.16b, {v3.16b}, v16.16b
    382 	eor	v3.16b, v3.16b, v4.16b
    383 
    384 	/*
    385 	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
    386 	 * v6.4s := (0,0,pprk[0],pprk[1])
    387 	 * v7.4s := (0,0,0,pprk[0])
    388 	 */
    389 	ext	v5.16b, v0.16b, v1.16b, #12
    390 	ext	v6.16b, v0.16b, v1.16b, #8
    391 	ext	v7.16b, v0.16b, v1.16b, #4
    392 
    393 	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
    394 	eor	v1.16b, v1.16b, v3.16b
    395 	eor	v1.16b, v1.16b, v5.16b
    396 	eor	v1.16b, v1.16b, v6.16b
    397 	eor	v1.16b, v1.16b, v7.16b
    398 
    399 	subs	x2, x2, #2		/* count down two rounds */
    400 	b.eq	2f			/* stop if this is the last one */
    401 
    402 	/* q3 := ShiftRows(SubBytes(q1)) */
    403 	mov	v3.16b, v1.16b
    404 	aese	v3.16b, v0.16b
    405 
    406 	/* v3.4s[i] := SubBytes(rk[3]) */
    407 	tbl	v3.16b, {v3.16b}, v17.16b
    408 
    409 	/*
    410 	 * v5.4s := (0,prk[0],prk[1],prk[2])
    411 	 * v6.4s := (0,0,prk[0],prk[1])
    412 	 * v7.4s := (0,0,0,prk[0])
    413 	 */
    414 	ext	v5.16b, v0.16b, v2.16b, #12
    415 	ext	v6.16b, v0.16b, v2.16b, #8
    416 	ext	v7.16b, v0.16b, v2.16b, #4
    417 
    418 	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
    419 	eor	v2.16b, v2.16b, v3.16b
    420 	eor	v2.16b, v2.16b, v5.16b
    421 	eor	v2.16b, v2.16b, v6.16b
    422 	eor	v2.16b, v2.16b, v7.16b
    423 
    424 	stp	q1, q2, [x0], #0x20	/* store two round keys */
    425 	b	1b
    426 
    427 2:	str	q1, [x0]		/* store last round key */
    428 	ret
    429 END(aesarmv8_setenckey256)
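
/*
 * Sketch of the AES-256 schedule above (FIPS-197, Nk = 8), using the same
 * hypothetical SubWord()/RotWord() helpers as in the AES-128 note earlier:
 *
 *	for (int i = 8; i < 60; i++) {
 *		uint32_t t = rk[i - 1];
 *
 *		if (i % 8 == 0)
 *			t = SubWord(RotWord(t)) ^ rcon[i/8 - 1];
 *		else if (i % 8 == 4)
 *			t = SubWord(t);		// the unshiftrows_3 step above
 *		rk[i] = rk[i - 8] ^ t;
 *	}
 */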
    430 
    431 /*
    432  * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
    433  *     uint32_t nrounds@x2)
    434  *
    435  *	Convert AES encryption round keys to AES decryption round keys.
    436  *	`rounds' must be between 10 and 14.
    437  *
    438  *	Standard ABI calling convention.
    439  */
    440 ENTRY(aesarmv8_enctodec)
    441 	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
    442 	b	2f
    443 	_ALIGN_TEXT
    444 1:	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
    445 2:	str	q0, [x1], #0x10	/* store round key */
    446 	subs	x2, x2, #1	/* count down round */
    447 	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
    448 	b.ne	1b		/* repeat if there's more */
    449 	str	q0, [x1]	/* store first round key verbatim */
    450 	ret
    451 END(aesarmv8_enctodec)
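
/*
 * What the loop above computes, for reference: the decryption schedule is
 * the encryption schedule in reverse order, with InvMixColumns (the AESIMC
 * step) applied to every round key except the outermost two.  As a sketch,
 * assuming <string.h> and a hypothetical InvMixColumns() helper acting on a
 * 16-byte round key:
 *
 *	void
 *	enctodec_sketch(uint8_t dec[][16], const uint8_t enc[][16],
 *	    unsigned nrounds)
 *	{
 *		memcpy(dec[0], enc[nrounds], 16);
 *		for (unsigned i = 1; i < nrounds; i++)
 *			InvMixColumns(dec[i], enc[nrounds - i]);
 *		memcpy(dec[nrounds], enc[0], 16);
 *	}
 *
 * This is the `equivalent inverse cipher' arrangement from FIPS-197, which
 * lets aesarmv8_dec* apply AESD/AESIMC in the same pattern as AESE/AESMC.
 */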
    452 
    453 /*
    454  * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
    455  *     uint8_t out[16] @x2, uint32_t nrounds@x3)
    456  *
    457  *	Encrypt a single block.
    458  *
    459  *	Standard ABI calling convention.
    460  */
    461 ENTRY(aesarmv8_enc)
    462 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    463 	mov	fp, sp
    464 	ld1	{v0.16b}, [x1]	/* q0 := ptxt */
    465 	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
    466 	st1	{v0.16b}, [x2]	/* store ctxt */
    467 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    468 	ret
    469 END(aesarmv8_enc)
    470 
    471 /*
    472  * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
    473  *     uint8_t out[16] @x2, uint32_t nrounds@x3)
    474  *
    475  *	Decrypt a single block.
    476  *
    477  *	Standard ABI calling convention.
    478  */
    479 ENTRY(aesarmv8_dec)
    480 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    481 	mov	fp, sp
    482 	ld1	{v0.16b}, [x1]	/* q0 := ctxt */
    483 	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
    484 	st1	{v0.16b}, [x2]	/* store ptxt */
    485 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    486 	ret
    487 END(aesarmv8_dec)
    488 
    489 /*
    490  * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    491  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
    492  *     uint32_t nrounds@x5)
    493  *
    494  *	Encrypt a contiguous sequence of blocks with AES-CBC.
    495  *
    496  *	nbytes must be an integral multiple of 16.
    497  *
    498  *	Standard ABI calling convention.
    499  */
    500 ENTRY(aesarmv8_cbc_enc)
    501 	cbz	x3, 2f			/* stop if nothing to do */
    502 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    503 	mov	fp, sp
    504 	mov	x9, x0			/* x9 := enckey */
    505 	mov	x10, x3			/* x10 := nbytes */
    506 	ld1	{v0.16b}, [x4]		/* q0 := chaining value */
    507 	_ALIGN_TEXT
    508 1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
    509 	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
    510 	mov	x0, x9			/* x0 := enckey */
    511 	mov	x3, x5			/* x3 := nrounds */
    512 	bl	aesarmv8_enc1		/* q0 := ctxt; trash x0/x3/q16 */
    513 	subs	x10, x10, #0x10		/* count down nbytes */
    514 	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
    515 	b.ne	1b			/* repeat if x10 is nonzero */
    516 	st1	{v0.16b}, [x4]		/* store chaining value */
    517 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    518 2:	ret
    519 END(aesarmv8_cbc_enc)
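
/*
 * For reference, the chaining computed above is (E = AES encryption with the
 * expanded key, c[-1] = the caller's iv):
 *
 *	c[i] = E(p[i] ^ c[i-1]),	i = 0, 1, ..., n-1
 *
 * and the final c[n-1] is written back through x4 as the next iv.  The
 * inverse, p[i] = D(c[i]) ^ c[i-1], is what the aesarmv8_cbc_dec* routines
 * below compute; they walk the buffers from the end so the last ciphertext
 * block can be saved as the new iv before it is overwritten when the
 * operation is done in place.
 */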
    520 
    521 /*
    522  * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    523  *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
    524  *     uint32_t nrounds@x5)
    525  *
    526  *	Decrypt a contiguous sequence of blocks with AES-CBC.
    527  *
    528  *	nbytes must be a positive integral multiple of 16.  This routine
    529  *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
    530  *
    531  *	Standard ABI calling convention.
    532  */
    533 ENTRY(aesarmv8_cbc_dec1)
    534 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    535 	mov	fp, sp
    536 	ld1	{v24.16b}, [x4]		/* q24 := iv */
     537 	mov	x9, x0			/* x9 := deckey */
    538 	mov	x10, x3			/* x10 := nbytes */
    539 	add	x1, x1, x3		/* x1 := pointer past end of in */
    540 	add	x2, x2, x3		/* x2 := pointer past end of out */
    541 	sub	x1, x1, #0x10
    542 	ld1	{v0.16b}, [x1]		/* q0 := last ciphertext block */
    543 	st1	{v0.16b}, [x4]		/* update iv */
    544 	b	2f
    545 	_ALIGN_TEXT
    546 1:	sub	x1, x1, #0x10
    547 	ld1	{v31.16b}, [x1]		/* q31 := chaining value */
    548 	sub	x2, x2, #0x10
    549 	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
    550 	st1	{v0.16b}, [x2]		/* store plaintext block */
    551 	mov	v0.16b, v31.16b		/* move cv = ciphertext block */
     552 2:	mov	x0, x9			/* x0 := deckey */
    553 	mov	x3, x5			/* x3 := nrounds */
    554 	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3/q16 */
    555 	subs	x10, x10, #0x10		/* count down nbytes */
    556 	b.ne	1b			/* repeat if more blocks */
    557 	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
    558 	sub	x2, x2, #0x10		/* store first plaintext block */
    559 	st1	{v0.16b}, [x2]
    560 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    561 	ret
    562 END(aesarmv8_cbc_dec1)
    563 
    564 /*
    565  * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    566  *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
    567  *     uint32_t nrounds@x5)
    568  *
    569  *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
    570  *
    571  *	nbytes must be a positive integral multiple of 128.
    572  *
    573  *	Standard ABI calling convention.
    574  */
    575 ENTRY(aesarmv8_cbc_dec8)
    576 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    577 	mov	fp, sp
    578 	ld1	{v24.16b}, [x4]		/* q24 := iv */
     579 	mov	x9, x0			/* x9 := deckey */
    580 	mov	x10, x3			/* x10 := nbytes */
    581 	add	x1, x1, x3		/* x1 := pointer past end of in */
    582 	add	x2, x2, x3		/* x2 := pointer past end of out */
    583 	sub	x1, x1, #0x20
    584 	ld1	{v6.16b, v7.16b}, [x1]	/* q6, q7 := last ciphertext blocks */
    585 	st1	{v7.16b}, [x4]		/* update iv */
    586 	b	2f
    587 	_ALIGN_TEXT
    588 1:	sub	x1, x1, #0x20
    589 	ld1	{v6.16b, v7.16b}, [x1]
    590 	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
    591 	sub	x2, x2, #0x20
    592 	st1	{v0.16b, v1.16b}, [x2]
    593 2:	sub	x1, x1, #0x20
    594 	ld1	{v4.16b-v5.16b}, [x1]
    595 	sub	x1, x1, #0x40
    596 	ld1	{v0.16b-v3.16b}, [x1]
    597 
    598 	mov	v31.16b, v6.16b		/* q[24+i] := cv[i], 0<i<8 */
    599 	mov	v30.16b, v5.16b
    600 	mov	v29.16b, v4.16b
    601 	mov	v28.16b, v3.16b
    602 	mov	v27.16b, v2.16b
    603 	mov	v26.16b, v1.16b
    604 	mov	v25.16b, v0.16b
     605 	mov	x0, x9			/* x0 := deckey */
    606 	mov	x3, x5			/* x3 := nrounds */
    607 	bl	aesarmv8_dec8		/* q[i] := cv[i] ^ pt[i];
    608 					 * trash x0/x3/q16 */
    609 	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
    610 	eor	v6.16b, v6.16b, v30.16b
    611 	eor	v5.16b, v5.16b, v29.16b
    612 	eor	v4.16b, v4.16b, v28.16b
    613 	eor	v3.16b, v3.16b, v27.16b
    614 	eor	v2.16b, v2.16b, v26.16b
    615 	eor	v1.16b, v1.16b, v25.16b
    616 	subs	x10, x10, #0x80		/* count down nbytes */
    617 	sub	x2, x2, #0x20		/* store plaintext blocks */
    618 	st1	{v6.16b-v7.16b}, [x2]
    619 	sub	x2, x2, #0x40
    620 	st1	{v2.16b-v5.16b}, [x2]
    621 	b.ne	1b			/* repeat if there's more */
    622 	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
    623 	sub	x2, x2, #0x20
    624 	st1	{v0.16b, v1.16b}, [x2]	/* store first two plaintext blocks */
    625 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    626 	ret
    627 END(aesarmv8_cbc_dec8)
    628 
    629 /*
    630  * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    631  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    632  *     uint32_t nrounds@x5)
    633  *
    634  *	Encrypt a contiguous sequence of blocks with AES-XTS.
    635  *
    636  *	nbytes must be a positive integral multiple of 16.  This routine
    637  *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
    638  *
    639  *	Standard ABI calling convention.
    640  */
    641 ENTRY(aesarmv8_xts_enc1)
    642 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    643 	mov	fp, sp
    644 	mov	x9, x0			/* x9 := enckey */
    645 	mov	x10, x3			/* x10 := nbytes */
    646 	ld1	{v31.16b}, [x4]		/* q31 := tweak */
    647 	_ALIGN_TEXT
    648 1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ptxt */
    649 	mov	x0, x9			/* x0 := enckey */
    650 	mov	x3, x5			/* x3 := nrounds */
    651 	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
    652 	bl	aesarmv8_enc1		/* q0 := AES(...); trash x0/x3/q16 */
    653 	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
    654 	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
    655 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    656 	subs	x10, x10, #0x10		/* count down nbytes */
    657 	b.ne	1b			/* repeat if more blocks */
    658 	st1	{v31.16b}, [x4]		/* update tweak */
    659 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    660 	ret
    661 END(aesarmv8_xts_enc1)
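
/*
 * For reference, each block above is processed as (E = AES encryption with
 * the data key, T = the 128-bit tweak kept in q31):
 *
 *	c[i] = E(p[i] ^ T) ^ T;		T = T * x  in GF(2^128)
 *
 * with the field reduced modulo x^128 + x^7 + x^2 + x + 1; see
 * aesarmv8_xts_mulx below for the multiplication.  The decryption routines
 * are identical with D in place of E, since the tweak schedule does not
 * depend on the data.
 */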
    662 
    663 /*
    664  * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    665  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    666  *     uint32_t nrounds@x5)
    667  *
    668  *	Encrypt a contiguous sequence of blocks with AES-XTS.
    669  *
    670  *	nbytes must be a positive integral multiple of 128.
    671  *
    672  *	Standard ABI calling convention.
    673  */
    674 ENTRY(aesarmv8_xts_enc8)
    675 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    676 	mov	fp, sp
    677 	mov	x9, x0			/* x9 := enckey */
    678 	mov	x10, x3			/* x10 := nbytes */
    679 	ld1	{v31.16b}, [x4]		/* q31 := tweak */
    680 	_ALIGN_TEXT
    681 1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
    682 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    683 	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
    684 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    685 	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
    686 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    687 	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
    688 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    689 	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
    690 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    691 	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
    692 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    693 	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
    694 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    695 					/* q31 := tweak[7] */
    696 	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ptxt[i] */
    697 	ld1	{v4.16b-v7.16b}, [x1], #0x40
    698 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
    699 	eor	v1.16b, v1.16b, v25.16b
    700 	eor	v2.16b, v2.16b, v26.16b
    701 	eor	v3.16b, v3.16b, v27.16b
    702 	eor	v4.16b, v4.16b, v28.16b
    703 	eor	v5.16b, v5.16b, v29.16b
    704 	eor	v6.16b, v6.16b, v30.16b
    705 	eor	v7.16b, v7.16b, v31.16b
    706 	mov	x0, x9			/* x0 := enckey */
    707 	mov	x3, x5			/* x3 := nrounds */
    708 	bl	aesarmv8_enc8		/* encrypt q0-q7; trash x0/x3/q16 */
    709 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
    710 	eor	v1.16b, v1.16b, v25.16b
    711 	eor	v2.16b, v2.16b, v26.16b
    712 	eor	v3.16b, v3.16b, v27.16b
    713 	eor	v4.16b, v4.16b, v28.16b
    714 	eor	v5.16b, v5.16b, v29.16b
    715 	eor	v6.16b, v6.16b, v30.16b
    716 	eor	v7.16b, v7.16b, v31.16b
    717 	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store ciphertext blocks */
    718 	st1	{v4.16b-v7.16b}, [x2], #0x40
    719 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    720 	subs	x10, x10, #0x80		/* count down nbytes */
    721 	b.ne	1b			/* repeat if more block groups */
    722 	st1	{v31.16b}, [x4]		/* update tweak */
    723 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    724 	ret
    725 END(aesarmv8_xts_enc8)
    726 
    727 /*
    728  * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    729  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    730  *     uint32_t nrounds@x5)
    731  *
     732  *	Decrypt a contiguous sequence of blocks with AES-XTS.
    733  *
    734  *	nbytes must be a positive integral multiple of 16.  This routine
    735  *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
    736  *
    737  *	Standard ABI calling convention.
    738  */
    739 ENTRY(aesarmv8_xts_dec1)
    740 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    741 	mov	fp, sp
    742 	mov	x9, x0			/* x9 := deckey */
    743 	mov	x10, x3			/* x10 := nbytes */
    744 	ld1	{v31.16b}, [x4]		/* q31 := tweak */
    745 	_ALIGN_TEXT
    746 1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ctxt */
    747 	mov	x0, x9			/* x0 := deckey */
    748 	mov	x3, x5			/* x3 := nrounds */
    749 	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
    750 	bl	aesarmv8_dec1		/* q0 := AES(...); trash x0/x3/q16 */
    751 	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
    752 	st1	{v0.16b}, [x2], #0x10	/* store plaintext block */
    753 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    754 	subs	x10, x10, #0x10		/* count down nbytes */
    755 	b.ne	1b			/* repeat if more blocks */
    756 	st1	{v31.16b}, [x4]		/* update tweak */
    757 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    758 	ret
    759 END(aesarmv8_xts_dec1)
    760 
    761 /*
    762  * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    763  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    764  *     uint32_t nrounds@x5)
    765  *
     766  *	Decrypt a contiguous sequence of blocks with AES-XTS.
    767  *
    768  *	nbytes must be a positive integral multiple of 128.
    769  *
    770  *	Standard ABI calling convention.
    771  */
    772 ENTRY(aesarmv8_xts_dec8)
    773 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    774 	mov	fp, sp
    775 	mov	x9, x0			/* x9 := deckey */
    776 	mov	x10, x3			/* x10 := nbytes */
    777 	ld1	{v31.16b}, [x4]		/* q31 := tweak */
    778 	_ALIGN_TEXT
    779 1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
    780 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    781 	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
    782 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    783 	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
    784 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    785 	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
    786 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    787 	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
    788 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    789 	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
    790 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    791 	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
    792 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    793 					/* q31 := tweak[7] */
    794 	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ctxt[i] */
    795 	ld1	{v4.16b-v7.16b}, [x1], #0x40
    796 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
    797 	eor	v1.16b, v1.16b, v25.16b
    798 	eor	v2.16b, v2.16b, v26.16b
    799 	eor	v3.16b, v3.16b, v27.16b
    800 	eor	v4.16b, v4.16b, v28.16b
    801 	eor	v5.16b, v5.16b, v29.16b
    802 	eor	v6.16b, v6.16b, v30.16b
    803 	eor	v7.16b, v7.16b, v31.16b
    804 	mov	x0, x9			/* x0 := deckey */
    805 	mov	x3, x5			/* x3 := nrounds */
    806 	bl	aesarmv8_dec8		/* decrypt q0-q7; trash x0/x3/q16 */
    807 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
    808 	eor	v1.16b, v1.16b, v25.16b
    809 	eor	v2.16b, v2.16b, v26.16b
    810 	eor	v3.16b, v3.16b, v27.16b
    811 	eor	v4.16b, v4.16b, v28.16b
    812 	eor	v5.16b, v5.16b, v29.16b
    813 	eor	v6.16b, v6.16b, v30.16b
    814 	eor	v7.16b, v7.16b, v31.16b
    815 	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store plaintext blocks */
    816 	st1	{v4.16b-v7.16b}, [x2], #0x40
    817 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    818 	subs	x10, x10, #0x80		/* count down nbytes */
    819 	b.ne	1b			/* repeat if more block groups */
    820 	st1	{v31.16b}, [x4]		/* update tweak */
    821 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    822 	ret
    823 END(aesarmv8_xts_dec8)
    824 
    825 /*
    826  * aesarmv8_xts_mulx(tweak@q31)
    827  *
    828  *	Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
    829  *	Uses x0 and q0/q1 as temporaries.
    830  */
    831 	.text
    832 	_ALIGN_TEXT
    833 	.type	aesarmv8_xts_mulx,@function
    834 aesarmv8_xts_mulx:
    835 	/*
    836 	 * Simultaneously determine
    837 	 * (a) whether the high bit of the low half must be
    838 	 *     shifted into the low bit of the high half, and
    839 	 * (b) whether the high bit of the high half must be
    840 	 *     carried into x^128 = x^7 + x^2 + x + 1.
    841 	 */
    842 	adrl	x0, xtscarry
    843 	cmlt	v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
    844 	ld1	{v0.16b}, [x0]		/* q0 := xtscarry */
    845 	ext	v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
    846 	shl	v31.2d, v31.2d, #1	/* shift */
    847 	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
    848 	eor	v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */
    849 	ret
    850 END(aesarmv8_xts_mulx)
    851 
    852 	.section .rodata
    853 	.p2align 4
    854 	.type	xtscarry,@object
    855 xtscarry:
    856 	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
    857 END(xtscarry)
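
/*
 * Portable sketch of the same multiplication by x, for reference only (the
 * tweak is treated as a 128-bit little-endian integer, as in IEEE P1619):
 *
 *	#include <stdint.h>
 *
 *	void
 *	xts_mulx_sketch(uint8_t t[16])
 *	{
 *		unsigned carry = 0;
 *
 *		for (int i = 0; i < 16; i++) {
 *			unsigned b = t[i];
 *
 *			t[i] = (uint8_t)((b << 1) | carry);
 *			carry = b >> 7;
 *		}
 *		if (carry)
 *			t[0] ^= 0x87;	// x^128 = x^7 + x^2 + x + 1
 *	}
 *
 * The SIMD version above does the 128-bit shift as two 64-bit shifts and
 * uses the xtscarry constant to inject both the inter-half carry (the 1 at
 * byte 8) and the reduction (the 0x87 at byte 0) in one masked EOR.
 */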
    858 
    859 /*
    860  * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
    861  *
    862  *	Update an AES-XTS tweak.
    863  *
    864  *	Standard ABI calling convention.
    865  */
    866 ENTRY(aesarmv8_xts_update)
    867 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    868 	mov	fp, sp
    869 	ld1	{v31.16b}, [x0]		/* load tweak */
    870 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    871 	st1	{v31.16b}, [x1]		/* store tweak */
    872 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    873 	ret
    874 END(aesarmv8_xts_update)
    875 
    876 /*
    877  * aesarmv8_cbcmac_update1(const struct aesenc *enckey@x0,
    878  *     const uint8_t *in@x1, size_t nbytes@x2, uint8_t auth[16] @x3,
    879  *     uint32_t nrounds@x4)
    880  *
    881  *	Update CBC-MAC.
    882  *
    883  *	nbytes must be a positive integral multiple of 16.
    884  *
    885  *	Standard ABI calling convention.
    886  */
    887 ENTRY(aesarmv8_cbcmac_update1)
    888 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    889 	mov	fp, sp
    890 	ld1	{v0.16b}, [x3]		/* q0 := initial authenticator */
    891 	mov	x9, x0			/* x9 := enckey */
    892 	mov	x5, x3			/* x5 := &auth (enc1 trashes x3) */
    893 	_ALIGN_TEXT
    894 1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
    895 	mov	x0, x9			/* x0 := enckey */
    896 	mov	x3, x4			/* x3 := nrounds */
    897 	eor	v0.16b, v0.16b, v1.16b	/* q0 := auth ^ ptxt */
    898 	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */
    899 	subs	x2, x2, #0x10		/* count down nbytes */
     900 	b.ne	1b			/* repeat if x2 is nonzero */
    901 	st1	{v0.16b}, [x5]		/* store updated authenticator */
    902 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    903 	ret
    904 END(aesarmv8_cbcmac_update1)
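
/*
 * For reference, the update above is plain CBC encryption that keeps only
 * the running block (E = AES encryption with enckey):
 *
 *	auth = E(auth ^ p[i])	for each 16-byte block p[i]
 *
 * with the final auth written back through x5 (the saved &auth).
 */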
    905 
    906 /*
    907  * aesarmv8_ccm_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    908  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
    909  *     uint32_t nrounds@x5)
    910  *
    911  *	Update CCM encryption.
    912  *
    913  *	nbytes must be a positive integral multiple of 16.
    914  *
    915  *	Standard ABI calling convention.
    916  */
    917 ENTRY(aesarmv8_ccm_enc1)
    918 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    919 	mov	fp, sp
     920 	ld1	{v0.16b, v1.16b}, [x4]	/* q0 := auth, q1 := ctr (be) */
     921 	mov	v2.16b, v1.16b		/* q2 := ctr (be) */
    922 	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
    923 	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
    924 	mov	x9, x0			/* x9 := enckey */
    925 	mov	x10, x3			/* x10 := nbytes */
    926 	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */
    927 	_ALIGN_TEXT
    928 1:	ld1	{v3.16b}, [x1], #0x10	/* q3 := plaintext block */
    929 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
    930 	mov	x0, x9			/* x0 := enckey */
    931 	mov	x3, x5			/* x3 := nrounds */
    932 	rev32	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
    933 	eor	v0.16b, v0.16b, v3.16b	/* q0 := auth ^ ptxt */
    934 	bl	aesarmv8_enc2		/* q0 := auth', q1 := pad;
    935 					 * trash x0/x3/q16 */
    936 	eor	v3.16b, v1.16b, v3.16b	/* q3 := ciphertext block */
    937 	subs	x10, x10, #0x10		/* count down bytes */
    938 	st1	{v3.16b}, [x2], #0x10	/* store ciphertext block */
    939 	b.ne	1b			/* repeat if more blocks */
    940 	rev32	v2.16b, v2.16b		/* q2 := ctr (big-endian) */
    941 	mov	v1.16b, v2.16b		/* store updated auth/ctr */
    942 	st1	{v0.16b-v1.16b}, [x4]
    943 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    944 	ret
    945 END(aesarmv8_ccm_enc1)
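
/*
 * For reference, one iteration of the loop above does, per 16-byte plaintext
 * block p (E = AES encryption with enckey; auth and ctr are the two halves
 * of the 32-byte authctr buffer):
 *
 *	auth = E(auth ^ p)	// CBC-MAC half of CCM
 *	ctr  = ctr + 1		// 32-bit big-endian counter in the last word
 *	c    = p ^ E(ctr)	// CTR half of CCM
 *
 * The two encryptions are issued together via aesarmv8_enc2 so the AES
 * rounds for the authenticator and the pad proceed in parallel.
 */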
    946 
    947 /*
    948  * aesarmv8_ccm_dec1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    949  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
    950  *     uint32_t nrounds@x5)
    951  *
    952  *	Update CCM decryption.
    953  *
    954  *	nbytes must be a positive integral multiple of 16.
    955  *
    956  *	Standard ABI calling convention.
    957  */
    958 ENTRY(aesarmv8_ccm_dec1)
    959 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    960 	mov	fp, sp
    961 	ld1	{v1.16b, v2.16b}, [x4]	/* q1 := auth, q2 := ctr (be) */
    962 	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
    963 	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
    964 	mov	x9, x0			/* x9 := enckey */
    965 	mov	x10, x3			/* x10 := nbytes */
    966 	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */
    967 
    968 	/* Decrypt the first block.  */
    969 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
    970 	mov	x3, x5			/* x3 := nrounds */
    971 	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
    972 	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
    973 	bl	aesarmv8_enc1		/* q0 := pad; trash x0/x3/q16 */
    974 	b	2f
    975 
    976 	_ALIGN_TEXT
    977 1:	/*
    978 	 * Authenticate the last block and decrypt the next block
    979 	 * simultaneously.
    980 	 *
    981 	 *	q1 = auth ^ ptxt[-1]
     982 	 *	q2 = ctr[-1] (host-endian)
    983 	 */
    984 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
    985 	mov	x0, x9			/* x0 := enckey */
    986 	mov	x3, x5			/* x3 := nrounds */
    987 	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
    988 	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
    989 	bl	aesarmv8_enc2		/* q0 := pad, q1 := auth';
    990 					 * trash x0/x3/q16 */
    991 2:	eor	v3.16b, v0.16b, v3.16b	/* q3 := plaintext block */
    992 	subs	x10, x10, #0x10
    993 	st1	{v3.16b}, [x2], #0x10		/* store plaintext */
    994 	eor	v1.16b, v1.16b, v3.16b	/* q1 := auth ^ ptxt */
    995 	b.ne	1b
    996 
    997 	rev32	v2.16b, v2.16b		/* q2 := ctr (big-endian) */
    998 
    999 	/* Authenticate the last block.  */
   1000 	mov	x0, x9			/* x0 := enckey */
   1001 	mov	x3, x5			/* x3 := nrounds */
   1002 	mov	v0.16b, v1.16b		/* q0 := auth ^ ptxt */
   1003 	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */
   1004 
   1005 	mov	v1.16b, v2.16b		/* store updated auth/ctr */
   1006 	st1	{v0.16b-v1.16b}, [x4]
   1007 	ldp	fp, lr, [sp], #16	/* pop stack frame */
   1008 	ret
   1009 END(aesarmv8_ccm_dec1)
   1010 
   1011 	.section .rodata
   1012 	.p2align 4
   1013 	.type	ctr32_inc,@object
   1014 ctr32_inc:
   1015 	.int	0, 0, 0, 1
   1016 END(ctr32_inc)
   1017 
   1018 /*
   1019  * aesarmv8_enc1(const struct aesenc *enckey@x0,
   1020  *     uint128_t block@q0, uint32_t nrounds@x3)
   1021  *
   1022  *	Encrypt a single AES block in q0.
   1023  *
   1024  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1025  */
   1026 	.text
   1027 	_ALIGN_TEXT
   1028 	.type	aesarmv8_enc1,@function
   1029 aesarmv8_enc1:
   1030 	ldr	q16, [x0], #0x10	/* load round key */
   1031 	sub	x3, x3, #1
   1032 	_ALIGN_TEXT
   1033 1:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0)))) */
   1034 	aese	v0.16b, v16.16b
   1035 	aesmc	v0.16b, v0.16b
   1036 	ldr	q16, [x0], #0x10
   1037 	subs	x3, x3, #1
   1038 	b.ne	1b
   1039 	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
   1040 	aese	v0.16b, v16.16b
   1041 	ldr	q16, [x0]		/* load last round key */
   1042 	/* q0 := AddRoundKey_q16(q0) */
   1043 	eor	v0.16b, v0.16b, v16.16b
   1044 	ret
   1045 END(aesarmv8_enc1)
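
/*
 * How the loop above lines up with the textbook cipher, for reference: AESE
 * computes ShiftRows(SubBytes(state ^ roundkey)) and AESMC computes
 * MixColumns, so the routine evaluates
 *
 *	state ^= rk[0]
 *	for r = 1, ..., nrounds-1:
 *		state = MixColumns(ShiftRows(SubBytes(state))) ^ rk[r]
 *	state = ShiftRows(SubBytes(state)) ^ rk[nrounds]
 *
 * with each AddRoundKey folded into the following AESE and the last round
 * key applied by the final EOR.  aesarmv8_enc2/enc8 and the aesarmv8_dec*
 * routines below follow the same structure.
 */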
   1046 
   1047 /*
   1048  * aesarmv8_enc2(const struct aesenc *enckey@x0,
   1049  *     uint128_t block@q0, uint128_t block@q1, uint32_t nrounds@x3)
   1050  *
   1051  *	Encrypt two AES blocks in q0 and q1.
   1052  *
   1053  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1054  */
   1055 	.text
   1056 	_ALIGN_TEXT
   1057 	.type	aesarmv8_enc2,@function
   1058 aesarmv8_enc2:
   1059 	ldr	q16, [x0], #0x10	/* load round key */
   1060 	sub	x3, x3, #1
   1061 	_ALIGN_TEXT
   1062 1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
   1063 	aese	v0.16b, v16.16b
   1064 	aesmc	v0.16b, v0.16b
   1065 	aese	v1.16b, v16.16b
   1066 	aesmc	v1.16b, v1.16b
   1067 	ldr	q16, [x0], #0x10	/* load next round key */
   1068 	subs	x3, x3, #1
   1069 	b.ne	1b
   1070 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
   1071 	aese	v0.16b, v16.16b
   1072 	aese	v1.16b, v16.16b
   1073 	ldr	q16, [x0]		/* load last round key */
   1074 	/* q[i] := AddRoundKey_q16(q[i]) */
   1075 	eor	v0.16b, v0.16b, v16.16b
   1076 	eor	v1.16b, v1.16b, v16.16b
   1077 	ret
   1078 END(aesarmv8_enc2)
   1079 
   1080 /*
   1081  * aesarmv8_enc8(const struct aesenc *enckey@x0,
   1082  *     uint128_t block0@q0, ..., uint128_t block7@q7,
   1083  *     uint32_t nrounds@x3)
   1084  *
   1085  *	Encrypt eight AES blocks in q0 through q7 in parallel.
   1086  *
   1087  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1088  */
   1089 	.text
   1090 	_ALIGN_TEXT
   1091 	.type	aesarmv8_enc8,@function
   1092 aesarmv8_enc8:
   1093 	ldr	q16, [x0], #0x10	/* load round key */
   1094 	sub	x3, x3, #1
   1095 	_ALIGN_TEXT
   1096 1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
   1097 	aese	v0.16b, v16.16b
   1098 	aesmc	v0.16b, v0.16b
   1099 	aese	v1.16b, v16.16b
   1100 	aesmc	v1.16b, v1.16b
   1101 	aese	v2.16b, v16.16b
   1102 	aesmc	v2.16b, v2.16b
   1103 	aese	v3.16b, v16.16b
   1104 	aesmc	v3.16b, v3.16b
   1105 	aese	v4.16b, v16.16b
   1106 	aesmc	v4.16b, v4.16b
   1107 	aese	v5.16b, v16.16b
   1108 	aesmc	v5.16b, v5.16b
   1109 	aese	v6.16b, v16.16b
   1110 	aesmc	v6.16b, v6.16b
   1111 	aese	v7.16b, v16.16b
   1112 	aesmc	v7.16b, v7.16b
   1113 	ldr	q16, [x0], #0x10	/* load next round key */
   1114 	subs	x3, x3, #1
   1115 	b.ne	1b
   1116 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
   1117 	aese	v0.16b, v16.16b
   1118 	aese	v1.16b, v16.16b
   1119 	aese	v2.16b, v16.16b
   1120 	aese	v3.16b, v16.16b
   1121 	aese	v4.16b, v16.16b
   1122 	aese	v5.16b, v16.16b
   1123 	aese	v6.16b, v16.16b
   1124 	aese	v7.16b, v16.16b
   1125 	ldr	q16, [x0]		/* load last round key */
   1126 	/* q[i] := AddRoundKey_q16(q[i]) */
   1127 	eor	v0.16b, v0.16b, v16.16b
   1128 	eor	v1.16b, v1.16b, v16.16b
   1129 	eor	v2.16b, v2.16b, v16.16b
   1130 	eor	v3.16b, v3.16b, v16.16b
   1131 	eor	v4.16b, v4.16b, v16.16b
   1132 	eor	v5.16b, v5.16b, v16.16b
   1133 	eor	v6.16b, v6.16b, v16.16b
   1134 	eor	v7.16b, v7.16b, v16.16b
   1135 	ret
   1136 END(aesarmv8_enc8)
   1137 
   1138 /*
   1139  * aesarmv8_dec1(const struct aesdec *deckey@x0,
   1140  *     uint128_t block@q0, uint32_t nrounds@x3)
   1141  *
   1142  *	Decrypt a single AES block in q0.
   1143  *
   1144  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1145  */
   1146 	.text
   1147 	_ALIGN_TEXT
   1148 	.type	aesarmv8_dec1,@function
   1149 aesarmv8_dec1:
   1150 	ldr	q16, [x0], #0x10	/* load round key */
   1151 	sub	x3, x3, #1
   1152 	_ALIGN_TEXT
   1153 1:	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
   1154 	aesd	v0.16b, v16.16b
   1155 	/* q0 := InMixColumns(q0) */
   1156 	aesimc	v0.16b, v0.16b
   1157 	ldr	q16, [x0], #0x10	/* load next round key */
   1158 	subs	x3, x3, #1
   1159 	b.ne	1b
   1160 	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
   1161 	aesd	v0.16b, v16.16b
   1162 	ldr	q16, [x0]		/* load last round key */
   1163 	/* q0 := AddRoundKey_q16(q0) */
   1164 	eor	v0.16b, v0.16b, v16.16b
   1165 	ret
   1166 END(aesarmv8_dec1)
   1167 
   1168 /*
   1169  * aesarmv8_dec8(const struct aesdec *deckey@x0,
   1170  *     uint128_t block0@q0, ..., uint128_t block7@q7,
   1171  *     uint32_t nrounds@x3)
   1172  *
   1173  *	Decrypt eight AES blocks in q0 through q7 in parallel.
   1174  *
   1175  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1176  */
   1177 	.text
   1178 	_ALIGN_TEXT
   1179 	.type	aesarmv8_dec8,@function
   1180 aesarmv8_dec8:
   1181 	ldr	q16, [x0], #0x10	/* load round key */
   1182 	sub	x3, x3, #1
   1183 	_ALIGN_TEXT
   1184 1:	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
   1185 	aesd	v0.16b, v16.16b
   1186 	/* q[i] := InMixColumns(q[i]) */
   1187 	aesimc	v0.16b, v0.16b
   1188 	aesd	v1.16b, v16.16b
   1189 	aesimc	v1.16b, v1.16b
   1190 	aesd	v2.16b, v16.16b
   1191 	aesimc	v2.16b, v2.16b
   1192 	aesd	v3.16b, v16.16b
   1193 	aesimc	v3.16b, v3.16b
   1194 	aesd	v4.16b, v16.16b
   1195 	aesimc	v4.16b, v4.16b
   1196 	aesd	v5.16b, v16.16b
   1197 	aesimc	v5.16b, v5.16b
   1198 	aesd	v6.16b, v16.16b
   1199 	aesimc	v6.16b, v6.16b
   1200 	aesd	v7.16b, v16.16b
   1201 	aesimc	v7.16b, v7.16b
   1202 	ldr	q16, [x0], #0x10	/* load next round key */
   1203 	subs	x3, x3, #1
   1204 	b.ne	1b
   1205 	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
   1206 	aesd	v0.16b, v16.16b
   1207 	aesd	v1.16b, v16.16b
   1208 	aesd	v2.16b, v16.16b
   1209 	aesd	v3.16b, v16.16b
   1210 	aesd	v4.16b, v16.16b
   1211 	aesd	v5.16b, v16.16b
   1212 	aesd	v6.16b, v16.16b
   1213 	aesd	v7.16b, v16.16b
   1214 	ldr	q16, [x0]		/* load last round key */
   1215 	/* q[i] := AddRoundKey_q16(q[i]) */
   1216 	eor	v0.16b, v0.16b, v16.16b
   1217 	eor	v1.16b, v1.16b, v16.16b
   1218 	eor	v2.16b, v2.16b, v16.16b
   1219 	eor	v3.16b, v3.16b, v16.16b
   1220 	eor	v4.16b, v4.16b, v16.16b
   1221 	eor	v5.16b, v5.16b, v16.16b
   1222 	eor	v6.16b, v6.16b, v16.16b
   1223 	eor	v7.16b, v7.16b, v16.16b
   1224 	ret
   1225 END(aesarmv8_dec8)
   1226