      1 /*	$NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  * POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 #include <aarch64/asm.h>
     30 
     31 RCSID("$NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $")
     32 
     33 	.arch_extension	aes
     34 
     35 /*
     36  * uint32_t rcon[10]
     37  *
     38  *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2).
      39  *	Such elements of GF(2^8) need only eight bits to be represented,
     40  *	but we store them in 4-byte units so we can copy one into all
     41  *	four 4-byte lanes of a vector register with a single LD1R.  The
     42  *	access pattern is fixed, so indices into this table are never
     43  *	secret.
     44  */
     45 	.section .rodata
     46 	.p2align 2
     47 	.type	rcon,@object
     48 rcon:
     49 	.long	0x01
     50 	.long	0x02
     51 	.long	0x04
     52 	.long	0x08
     53 	.long	0x10
     54 	.long	0x20
     55 	.long	0x40
     56 	.long	0x80
     57 	.long	0x1b
     58 	.long	0x36
     59 END(rcon)
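
/*
 *	Illustrative sketch (not part of this file): the table above can
 *	be generated in C by repeated multiplication by x in
 *	GF(2^8) = GF(2)[x]/(x^8 + x^4 + x^3 + x + 1), i.e. a left shift
 *	with conditional reduction by 0x1b (needs <stdint.h>).
 *
 *	static void
 *	gen_rcon(uint32_t rcon[10])
 *	{
 *		uint8_t r = 0x01;			// x^0
 *		for (unsigned i = 0; i < 10; i++) {
 *			rcon[i] = r;			// x^i mod AES polynomial
 *			r = (r << 1) ^ ((r & 0x80) ? 0x1b : 0);
 *		}
 *	}
 */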
     60 
     61 /*
     62  * uint128_t unshiftrows_rotword_1
     63  *
     64  *	Table for TBL instruction to undo ShiftRows, and then do
     65  *	RotWord on word 1, and then copy it into all the other words.
     66  */
     67 	.section .rodata
     68 	.p2align 4
     69 	.type	unshiftrows_rotword_1,@object
     70 unshiftrows_rotword_1:
     71 	.byte	0x01,0x0e,0x0b,0x04
     72 	.byte	0x01,0x0e,0x0b,0x04
     73 	.byte	0x01,0x0e,0x0b,0x04
     74 	.byte	0x01,0x0e,0x0b,0x04
     75 END(unshiftrows_rotword_1)
     76 
     77 /*
     78  * uint128_t unshiftrows_3
     79  *
     80  *	Table for TBL instruction to undo ShiftRows, and then copy word
     81  *	3 into all the other words.
     82  */
     83 	.section .rodata
     84 	.p2align 4
     85 	.type	unshiftrows_3,@object
     86 unshiftrows_3:
     87 	.byte	0x0c,0x09,0x06,0x03
     88 	.byte	0x0c,0x09,0x06,0x03
     89 	.byte	0x0c,0x09,0x06,0x03
     90 	.byte	0x0c,0x09,0x06,0x03
     91 END(unshiftrows_3)
     92 
     93 /*
     94  * uint128_t unshiftrows_rotword_3
     95  *
     96  *	Table for TBL instruction to undo ShiftRows, and then do
     97  *	RotWord on word 3, and then copy it into all the other words.
     98  */
     99 	.section .rodata
    100 	.p2align 4
    101 	.type	unshiftrows_rotword_3,@object
    102 unshiftrows_rotword_3:
    103 	.byte	0x09,0x06,0x03,0x0c
    104 	.byte	0x09,0x06,0x03,0x0c
    105 	.byte	0x09,0x06,0x03,0x0c
    106 	.byte	0x09,0x06,0x03,0x0c
    107 END(unshiftrows_rotword_3)
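
/*
 *	Illustrative sketch (not part of this file) of how the three TBL
 *	index tables above can be derived in C: AESE with a zero key
 *	leaves SubBytes(input) in ShiftRows order, so selecting word w of
 *	the original input (optionally RotWord'ed) means indexing the
 *	bytes of column w at their post-ShiftRows positions, broadcast to
 *	all four output words.
 *
 *	// Destination index of input byte (row, col) under ShiftRows,
 *	// with the usual column-major byte layout i = 4*col + row.
 *	static unsigned
 *	shiftrows_dst(unsigned row, unsigned col)
 *	{
 *		return 4*((col + 4 - row) % 4) + row;
 *	}
 *
 *	// w = source word, rot = 1 to apply RotWord as well; e.g.
 *	// gen_tbl(idx, 3, 1) reproduces unshiftrows_rotword_3 and
 *	// gen_tbl(idx, 3, 0) reproduces unshiftrows_3.
 *	static void
 *	gen_tbl(uint8_t idx[16], unsigned w, unsigned rot)
 *	{
 *		for (unsigned lane = 0; lane < 4; lane++)
 *			for (unsigned r = 0; r < 4; r++)
 *				idx[4*lane + r] =
 *				    shiftrows_dst((r + rot) % 4, w);
 *	}
 */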
    108 
    109 /*
    110  * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
    111  *
    112  *	Expand a 16-byte AES-128 key into 10 round keys.
    113  *
    114  *	Standard ABI calling convention.
    115  */
    116 ENTRY(aesarmv8_setenckey128)
    117 	ld1	{v1.16b}, [x1]	/* q1 := master key */
    118 
    119 	adrl	x4, unshiftrows_rotword_3
    120 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
    121 	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 table */
    122 
    123 	str	q1, [x0], #0x10	/* store master key as first round key */
    124 	mov	x2, #10		/* round count */
    125 	adrl	x3, rcon	/* round constant */
    126 
    127 1:	/*
    128 	 * q0 = 0
    129 	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
    130 	 * x0 = pointer to round key to compute
    131 	 * x2 = round count
    132 	 * x3 = rcon pointer
    133 	 */
    134 
    135 	/* q3 := ShiftRows(SubBytes(q1)) */
    136 	mov	v3.16b, v1.16b
    137 	aese	v3.16b, v0.16b
    138 
    139 	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
    140 	ld1r	{v4.4s}, [x3], #4
    141 	tbl	v3.16b, {v3.16b}, v16.16b
    142 	eor	v3.16b, v3.16b, v4.16b
    143 
    144 	/*
    145 	 * v5.4s := (0,prk[0],prk[1],prk[2])
    146 	 * v6.4s := (0,0,prk[0],prk[1])
    147 	 * v7.4s := (0,0,0,prk[0])
    148 	 */
    149 	ext	v5.16b, v0.16b, v1.16b, #12
    150 	ext	v6.16b, v0.16b, v1.16b, #8
    151 	ext	v7.16b, v0.16b, v1.16b, #4
    152 
    153 	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
    154 	eor	v1.16b, v1.16b, v3.16b
    155 	eor	v1.16b, v1.16b, v5.16b
    156 	eor	v1.16b, v1.16b, v6.16b
    157 	eor	v1.16b, v1.16b, v7.16b
    158 
    159 	subs	x2, x2, #1	/* count down rounds */
    160 	str	q1, [x0], #0x10	/* store round key */
    161 	b.ne	1b
    162 
    163 	ret
    164 END(aesarmv8_setenckey128)
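
/*
 *	Illustrative sketch (not part of this file) of one iteration of
 *	the loop above, written as the word-oriented FIPS-197 AES-128 key
 *	schedule; rotword() and subword() are assumed helpers on
 *	little-endian 32-bit words, matching how the round keys are laid
 *	out in memory here.
 *
 *	static void
 *	expand128_step(uint32_t rk[4], const uint32_t prk[4], uint32_t rcon)
 *	{
 *		uint32_t t = subword(rotword(prk[3])) ^ rcon;
 *
 *		rk[0] = prk[0] ^ t;	// t ^ prk[0]
 *		rk[1] = prk[1] ^ rk[0];	// t ^ prk[0] ^ prk[1]
 *		rk[2] = prk[2] ^ rk[1];	// t ^ prk[0] ^ prk[1] ^ prk[2]
 *		rk[3] = prk[3] ^ rk[2];	// t ^ prk[0] ^ ... ^ prk[3]
 *	}
 *
 *	The EXT/EOR sequence above computes the same prefix-XOR pattern
 *	four words at a time.
 */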
    165 
    166 /*
    167  * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
    168  *
    169  *	Expand a 24-byte AES-192 key into 12 round keys.
    170  *
    171  *	Standard ABI calling convention.
    172  */
    173 ENTRY(aesarmv8_setenckey192)
    174 	ld1	{v1.16b}, [x1], #0x10	/* q1 := master key[0:128) */
    175 	ld1	{v2.8b}, [x1]	/* d2 := master key[128:192) */
    176 
    177 	adrl	x4, unshiftrows_rotword_1
    178 	adrl	x5, unshiftrows_rotword_3
    179 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
    180 	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_1 */
    181 	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_rotword_3 */
    182 
    183 	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
    184 	mov	x2, #12		/* round count */
    185 	adrl	x3, rcon	/* round constant */
    186 
    187 1:	/*
    188 	 * q0 = 0
    189 	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
    190 	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
    191 	 * x0 = pointer to three round keys to compute
    192 	 * x2 = round count
    193 	 * x3 = rcon pointer
    194 	 */
    195 
    196 	/* q3 := ShiftRows(SubBytes(q2)) */
    197 	mov	v3.16b, v2.16b
    198 	aese	v3.16b, v0.16b
    199 
    200 	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
    201 	ld1r	{v4.4s}, [x3], #4
    202 	tbl	v3.16b, {v3.16b}, v16.16b
    203 	eor	v3.16b, v3.16b, v4.16b
    204 
    205 	/*
    206 	 * We need to compute:
    207 	 *
    208 	 * rk[0] := rklo[0]
    209 	 * rk[1] := rklo[1]
    210 	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
    211 	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
    212 	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
    213 	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
    214 	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
    215 	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
    216 	 *     ^ rklo[1]
    217 	 */
    218 
    219 	/*
    220 	 * v5.4s := (0,prk[0],prk[1],prk[2])
    221 	 * v6.4s := (0,0,prk[0],prk[1])
    222 	 * v7.4s := (0,0,0,prk[0])
    223 	 */
    224 	ext	v5.16b, v0.16b, v1.16b, #12
    225 	ext	v6.16b, v0.16b, v1.16b, #8
    226 	ext	v7.16b, v0.16b, v1.16b, #4
    227 
    228 	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
    229 	eor	v5.16b, v5.16b, v1.16b
    230 	eor	v5.16b, v5.16b, v3.16b
    231 	eor	v5.16b, v5.16b, v6.16b
    232 	eor	v5.16b, v5.16b, v7.16b
    233 
    234 	/*
    235 	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
    236 	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
    237 	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
    238 	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
    239 	 * (rklo[0],rklo[1],...).
    240 	 */
    241 
    242 	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
    243 	dup	v1.4s, v5.s[3]
    244 	mov	v1.s[0], v5.s[2]
    245 
    246 	/*
    247 	 * v6.4s := (0, 0, rklo[0], rklo[1])
    248 	 * v7.4s := (0, 0, 0, rklo[0])
    249 	 */
    250 	ext	v6.16b, v0.16b, v2.16b, #8
    251 	ext	v7.16b, v0.16b, v2.16b, #4
    252 
    253 	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
    254 	eor	v3.16b, v1.16b, v6.16b
    255 	eor	v3.16b, v3.16b, v7.16b
    256 
    257 	/*
    258 	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
    259 	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
    260 	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
    261 	 */
    262 	mov	v2.d[1], v5.d[0]
    263 
    264 	/* store two round keys */
    265 	stp	q2, q3, [x0], #0x20
    266 
    267 	/*
    268 	 * Live vector registers at this point:
    269 	 *
    270 	 *	q0 = zero
    271 	 *	q2 = rk
    272 	 *	q3 = nrk
    273 	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
    274 	 *	q16 = unshiftrows_rotword_1
    275 	 *	q17 = unshiftrows_rotword_3
    276 	 *
    277 	 * We have to compute, in q1:
    278 	 *
    279 	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
    280 	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
    281 	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
    282 	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
    283 	 *     ^ nrk[1]
    284 	 *
    285 	 * And, if there's any more afterward, in q2:
    286 	 *
    287 	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
    288 	 *     ^ nrk[1] ^ nrk[2]
    289 	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
    290 	 *     ^ nrk[1] ^ nrk[2] ^ nrk[3]
    291 	 */
    292 
     293 	/* q1 := ShiftRows(SubBytes(q3)) */
    294 	mov	v1.16b, v3.16b
    295 	aese	v1.16b, v0.16b
    296 
    297 	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
    298 	ld1r	{v4.4s}, [x3], #4
    299 	tbl	v1.16b, {v1.16b}, v17.16b
    300 	eor	v1.16b, v1.16b, v4.16b
    301 
    302 	/*
    303 	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already]
    304 	 * v4.4s := (0, rk[2], rk[3], nrk[0])
    305 	 * v6.4s := (0, 0, rk[2], rk[3])
    306 	 * v7.4s := (0, 0, 0, rk[2])
    307 	 */
    308 	ext	v4.16b, v0.16b, v5.16b, #12
    309 	ext	v6.16b, v0.16b, v5.16b, #8
    310 	ext	v7.16b, v0.16b, v5.16b, #4
    311 
    312 	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
    313 	eor	v1.16b, v1.16b, v5.16b
    314 	eor	v1.16b, v1.16b, v4.16b
    315 	eor	v1.16b, v1.16b, v6.16b
    316 	eor	v1.16b, v1.16b, v7.16b
    317 
    318 	subs	x2, x2, #3	/* count down three rounds */
    319 	str	q1, [x0], #0x10	/* store third round key */
    320 	b.eq	2f
    321 
    322 	/*
    323 	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
    324 	 * v5.4s := (0, nrk[2], xxx, xxx)
    325 	 */
    326 	ext	v4.16b, v3.16b, v0.16b, #8
    327 	ext	v5.16b, v0.16b, v4.16b, #12
    328 
    329 	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
    330 	dup	v2.4s, v1.s[3]
    331 
    332 	/*
    333 	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
    334 	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
    335 	 *     xxx, xxx)
    336 	 */
    337 	eor	v2.16b, v2.16b, v4.16b
    338 	eor	v2.16b, v2.16b, v5.16b
    339 
    340 	b	1b
    341 
    342 2:	ret
    343 END(aesarmv8_setenckey192)
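
/*
 *	Illustrative sketch (not part of this file): the block-at-a-time
 *	loop above (three round keys per iteration) computes the same
 *	thing as the standard word-oriented AES-192 schedule, 52 words
 *	w[0..51] with w[0..5] taken from the key; rotword(), subword(),
 *	and rcon[] are as in the AES-128 sketch.
 *
 *	static void
 *	expand192(uint32_t w[52])
 *	{
 *		for (unsigned i = 6; i < 52; i++) {
 *			uint32_t t = w[i - 1];
 *			if (i % 6 == 0)
 *				t = subword(rotword(t)) ^ rcon[i/6 - 1];
 *			w[i] = w[i - 6] ^ t;
 *		}
 *	}
 */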
    344 
    345 /*
    346  * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
    347  *
    348  *	Expand a 32-byte AES-256 key into 14 round keys.
    349  *
    350  *	Standard ABI calling convention.
    351  */
    352 ENTRY(aesarmv8_setenckey256)
    353 	/* q1 := key[0:128), q2 := key[128:256) */
    354 	ld1	{v1.16b-v2.16b}, [x1], #0x20
    355 
    356 	adrl	x4, unshiftrows_rotword_3
    357 	adrl	x5, unshiftrows_3
    358 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
    359 	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 */
    360 	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_3 */
    361 
    362 	/* store master key as first two round keys */
    363 	stp	q1, q2, [x0], #0x20
    364 	mov	x2, #14		/* round count */
    365 	adrl	x3, rcon	/* round constant */
    366 
    367 1:	/*
    368 	 * q0 = 0
    369 	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
    370 	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
    371 	 * x2 = round count
    372 	 * x3 = rcon pointer
    373 	 */
    374 
    375 	/* q3 := ShiftRows(SubBytes(q2)) */
    376 	mov	v3.16b, v2.16b
    377 	aese	v3.16b, v0.16b
    378 
    379 	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
    380 	ld1r	{v4.4s}, [x3], #4
    381 	tbl	v3.16b, {v3.16b}, v16.16b
    382 	eor	v3.16b, v3.16b, v4.16b
    383 
    384 	/*
    385 	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
    386 	 * v6.4s := (0,0,pprk[0],pprk[1])
    387 	 * v7.4s := (0,0,0,pprk[0])
    388 	 */
    389 	ext	v5.16b, v0.16b, v1.16b, #12
    390 	ext	v6.16b, v0.16b, v1.16b, #8
    391 	ext	v7.16b, v0.16b, v1.16b, #4
    392 
    393 	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
    394 	eor	v1.16b, v1.16b, v3.16b
    395 	eor	v1.16b, v1.16b, v5.16b
    396 	eor	v1.16b, v1.16b, v6.16b
    397 	eor	v1.16b, v1.16b, v7.16b
    398 
    399 	subs	x2, x2, #2		/* count down two rounds */
    400 	b.eq	2f			/* stop if this is the last one */
    401 
    402 	/* q3 := ShiftRows(SubBytes(q1)) */
    403 	mov	v3.16b, v1.16b
    404 	aese	v3.16b, v0.16b
    405 
    406 	/* v3.4s[i] := SubBytes(rk[3]) */
    407 	tbl	v3.16b, {v3.16b}, v17.16b
    408 
    409 	/*
    410 	 * v5.4s := (0,prk[0],prk[1],prk[2])
    411 	 * v6.4s := (0,0,prk[0],prk[1])
    412 	 * v7.4s := (0,0,0,prk[0])
    413 	 */
    414 	ext	v5.16b, v0.16b, v2.16b, #12
    415 	ext	v6.16b, v0.16b, v2.16b, #8
    416 	ext	v7.16b, v0.16b, v2.16b, #4
    417 
    418 	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
    419 	eor	v2.16b, v2.16b, v3.16b
    420 	eor	v2.16b, v2.16b, v5.16b
    421 	eor	v2.16b, v2.16b, v6.16b
    422 	eor	v2.16b, v2.16b, v7.16b
    423 
    424 	stp	q1, q2, [x0], #0x20	/* store two round keys */
    425 	b	1b
    426 
    427 2:	str	q1, [x0]		/* store last round key */
    428 	ret
    429 END(aesarmv8_setenckey256)
    430 
    431 /*
    432  * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
    433  *     uint32_t nrounds@x2)
    434  *
    435  *	Convert AES encryption round keys to AES decryption round keys.
     436  *	`nrounds' must be between 10 and 14.
    437  *
    438  *	Standard ABI calling convention.
    439  */
    440 ENTRY(aesarmv8_enctodec)
    441 	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
    442 	b	2f
    443 	_ALIGN_TEXT
    444 1:	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
    445 2:	str	q0, [x1], #0x10	/* store round key */
    446 	subs	x2, x2, #1	/* count down round */
    447 	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
    448 	b.ne	1b		/* repeat if there's more */
    449 	str	q0, [x1]	/* store first round key verbatim */
    450 	ret
    451 END(aesarmv8_enctodec)
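
/*
 *	Illustrative sketch (not part of this file) of the same
 *	conversion using the ACLE intrinsics from <arm_neon.h> (built
 *	with the aes target feature): the decryption schedule is the
 *	encryption schedule in reverse, with AESIMC applied to every
 *	round key except the first and last.
 *
 *	static void
 *	enctodec(const uint8x16_t erk[], uint8x16_t drk[], unsigned nrounds)
 *	{
 *		drk[0] = erk[nrounds];
 *		for (unsigned i = 1; i < nrounds; i++)
 *			drk[i] = vaesimcq_u8(erk[nrounds - i]);
 *		drk[nrounds] = erk[0];
 *	}
 */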
    452 
    453 /*
    454  * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
    455  *     uint8_t out[16] @x2, uint32_t nrounds@x3)
    456  *
    457  *	Encrypt a single block.
    458  *
    459  *	Standard ABI calling convention.
    460  */
    461 ENTRY(aesarmv8_enc)
    462 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    463 	mov	fp, sp
    464 	ld1	{v0.16b}, [x1]	/* q0 := ptxt */
    465 	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
    466 	st1	{v0.16b}, [x2]	/* store ctxt */
    467 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    468 	ret
    469 END(aesarmv8_enc)
    470 
    471 /*
    472  * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
    473  *     uint8_t out[16] @x2, uint32_t nrounds@x3)
    474  *
    475  *	Decrypt a single block.
    476  *
    477  *	Standard ABI calling convention.
    478  */
    479 ENTRY(aesarmv8_dec)
    480 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    481 	mov	fp, sp
    482 	ld1	{v0.16b}, [x1]	/* q0 := ctxt */
    483 	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
    484 	st1	{v0.16b}, [x2]	/* store ptxt */
    485 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    486 	ret
    487 END(aesarmv8_dec)
    488 
    489 /*
    490  * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    491  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
    492  *     uint32_t nrounds@x5)
    493  *
    494  *	Encrypt a contiguous sequence of blocks with AES-CBC.
    495  *
    496  *	nbytes must be an integral multiple of 16.
    497  *
    498  *	Standard ABI calling convention.
    499  */
    500 ENTRY(aesarmv8_cbc_enc)
    501 	cbz	x3, 2f			/* stop if nothing to do */
    502 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    503 	mov	fp, sp
    504 	mov	x9, x0			/* x9 := enckey */
    505 	mov	x10, x3			/* x10 := nbytes */
    506 	ld1	{v0.16b}, [x4]		/* q0 := chaining value */
    507 	_ALIGN_TEXT
    508 1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
    509 	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
    510 	mov	x0, x9			/* x0 := enckey */
    511 	mov	x3, x5			/* x3 := nrounds */
    512 	bl	aesarmv8_enc1		/* q0 := ctxt; trash x0/x3/q16 */
    513 	subs	x10, x10, #0x10		/* count down nbytes */
    514 	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
    515 	b.ne	1b			/* repeat if x10 is nonzero */
    516 	st1	{v0.16b}, [x4]		/* store chaining value */
    517 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    518 2:	ret
    519 END(aesarmv8_cbc_enc)
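
/*
 *	Illustrative sketch (not part of this file) of the CBC loop
 *	above; aes_enc() is an assumed stand-in for aesarmv8_enc1 with
 *	the key and round count bound elsewhere.
 *
 *	static void
 *	cbc_enc(const uint8_t *in, uint8_t *out, size_t nbytes,
 *	    uint8_t iv[16])
 *	{
 *		uint8_t cv[16];
 *
 *		memcpy(cv, iv, 16);
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (unsigned j = 0; j < 16; j++)
 *				cv[j] ^= in[j];		// cv ^= ptxt
 *			aes_enc(cv, cv);		// cv := E_k(cv)
 *			memcpy(out, cv, 16);
 *		}
 *		memcpy(iv, cv, 16);			// updated chaining value
 *	}
 */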
    520 
    521 /*
    522  * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    523  *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
    524  *     uint32_t nrounds@x5)
    525  *
    526  *	Decrypt a contiguous sequence of blocks with AES-CBC.
    527  *
    528  *	nbytes must be a positive integral multiple of 16.  This routine
    529  *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
    530  *
    531  *	Standard ABI calling convention.
    532  */
    533 ENTRY(aesarmv8_cbc_dec1)
    534 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    535 	mov	fp, sp
    536 	ld1	{v24.16b}, [x4]		/* q24 := iv */
     537 	mov	x9, x0			/* x9 := deckey */
    538 	mov	x10, x3			/* x10 := nbytes */
    539 	add	x1, x1, x3		/* x1 := pointer past end of in */
    540 	add	x2, x2, x3		/* x2 := pointer past end of out */
    541 	sub	x1, x1, #0x10
    542 	ld1	{v0.16b}, [x1]		/* q0 := last ciphertext block */
    543 	st1	{v0.16b}, [x4]		/* update iv */
    544 	b	2f
    545 	_ALIGN_TEXT
    546 1:	sub	x1, x1, #0x10
    547 	ld1	{v31.16b}, [x1]		/* q31 := chaining value */
    548 	sub	x2, x2, #0x10
    549 	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
    550 	st1	{v0.16b}, [x2]		/* store plaintext block */
    551 	mov	v0.16b, v31.16b		/* move cv = ciphertext block */
     552 2:	mov	x0, x9			/* x0 := deckey */
    553 	mov	x3, x5			/* x3 := nrounds */
    554 	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3/q16 */
    555 	subs	x10, x10, #0x10		/* count down nbytes */
    556 	b.ne	1b			/* repeat if more blocks */
    557 	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
    558 	sub	x2, x2, #0x10		/* store first plaintext block */
    559 	st1	{v0.16b}, [x2]
    560 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    561 	ret
    562 END(aesarmv8_cbc_dec1)
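
/*
 *	Illustrative sketch (not part of this file) of the backwards CBC
 *	decryption above.  Working from the last block toward the first
 *	lets the routine record the next IV before the input could be
 *	overwritten when in == out; aes_dec() is an assumed stand-in for
 *	aesarmv8_dec1.
 *
 *	static void
 *	cbc_dec(const uint8_t *in, uint8_t *out, size_t nbytes,
 *	    uint8_t iv[16])
 *	{
 *		size_t off = nbytes - 16;
 *		uint8_t newiv[16], blk[16];
 *
 *		memcpy(newiv, in + off, 16);		// next call's IV
 *		for (; off > 0; off -= 16) {
 *			aes_dec(in + off, blk);		// D_k(ctxt[i])
 *			for (unsigned j = 0; j < 16; j++)
 *				out[off + j] = blk[j] ^ in[off - 16 + j];
 *		}
 *		aes_dec(in, blk);			// first block uses iv
 *		for (unsigned j = 0; j < 16; j++)
 *			out[j] = blk[j] ^ iv[j];
 *		memcpy(iv, newiv, 16);
 *	}
 */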
    563 
    564 /*
    565  * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    566  *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
    567  *     uint32_t nrounds@x5)
    568  *
    569  *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
    570  *
    571  *	nbytes must be a positive integral multiple of 128.
    572  *
    573  *	Standard ABI calling convention.
    574  */
    575 ENTRY(aesarmv8_cbc_dec8)
    576 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    577 	mov	fp, sp
    578 	ld1	{v24.16b}, [x4]		/* q24 := iv */
     579 	mov	x9, x0			/* x9 := deckey */
    580 	mov	x10, x3			/* x10 := nbytes */
    581 	add	x1, x1, x3		/* x1 := pointer past end of in */
    582 	add	x2, x2, x3		/* x2 := pointer past end of out */
    583 	sub	x1, x1, #0x20
    584 	ld1	{v6.16b, v7.16b}, [x1]	/* q6, q7 := last ciphertext blocks */
    585 	st1	{v7.16b}, [x4]		/* update iv */
    586 	b	2f
    587 	_ALIGN_TEXT
    588 1:	sub	x1, x1, #0x20
    589 	ld1	{v6.16b, v7.16b}, [x1]
    590 	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
    591 	sub	x2, x2, #0x20
    592 	st1	{v0.16b, v1.16b}, [x2]
    593 2:	sub	x1, x1, #0x20
    594 	ld1	{v4.16b-v5.16b}, [x1]
    595 	sub	x1, x1, #0x40
    596 	ld1	{v0.16b-v3.16b}, [x1]
    597 
    598 	mov	v31.16b, v6.16b		/* q[24+i] := cv[i], 0<i<8 */
    599 	mov	v30.16b, v5.16b
    600 	mov	v29.16b, v4.16b
    601 	mov	v28.16b, v3.16b
    602 	mov	v27.16b, v2.16b
    603 	mov	v26.16b, v1.16b
    604 	mov	v25.16b, v0.16b
     605 	mov	x0, x9			/* x0 := deckey */
    606 	mov	x3, x5			/* x3 := nrounds */
    607 	bl	aesarmv8_dec8		/* q[i] := cv[i] ^ pt[i];
    608 					 * trash x0/x3/q16 */
    609 	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
    610 	eor	v6.16b, v6.16b, v30.16b
    611 	eor	v5.16b, v5.16b, v29.16b
    612 	eor	v4.16b, v4.16b, v28.16b
    613 	eor	v3.16b, v3.16b, v27.16b
    614 	eor	v2.16b, v2.16b, v26.16b
    615 	eor	v1.16b, v1.16b, v25.16b
    616 	subs	x10, x10, #0x80		/* count down nbytes */
    617 	sub	x2, x2, #0x20		/* store plaintext blocks */
    618 	st1	{v6.16b-v7.16b}, [x2]
    619 	sub	x2, x2, #0x40
    620 	st1	{v2.16b-v5.16b}, [x2]
    621 	b.ne	1b			/* repeat if there's more */
    622 	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
    623 	sub	x2, x2, #0x20
    624 	st1	{v0.16b, v1.16b}, [x2]	/* store first two plaintext blocks */
    625 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    626 	ret
    627 END(aesarmv8_cbc_dec8)
    628 
    629 /*
    630  * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    631  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    632  *     uint32_t nrounds@x5)
    633  *
    634  *	Encrypt a contiguous sequence of blocks with AES-XTS.
    635  *
    636  *	nbytes must be a positive integral multiple of 16.  This routine
    637  *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
    638  *
    639  *	Standard ABI calling convention.
    640  */
    641 ENTRY(aesarmv8_xts_enc1)
    642 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    643 	mov	fp, sp
    644 	mov	x9, x0			/* x9 := enckey */
    645 	mov	x10, x3			/* x10 := nbytes */
    646 	ld1	{v31.16b}, [x4]		/* q31 := tweak */
    647 	_ALIGN_TEXT
    648 1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ptxt */
    649 	mov	x0, x9			/* x0 := enckey */
    650 	mov	x3, x5			/* x3 := nrounds */
    651 	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
    652 	bl	aesarmv8_enc1		/* q0 := AES(...); trash x0/x3/q16 */
    653 	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
    654 	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
    655 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    656 	subs	x10, x10, #0x10		/* count down nbytes */
    657 	b.ne	1b			/* repeat if more blocks */
    658 	st1	{v31.16b}, [x4]		/* update tweak */
    659 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    660 	ret
    661 END(aesarmv8_xts_enc1)
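
/*
 *	Illustrative sketch (not part of this file) of the per-block XTS
 *	step above; aes_enc() and xts_mulx() are assumed stand-ins for
 *	aesarmv8_enc1 and aesarmv8_xts_mulx.
 *
 *	static void
 *	xts_enc(const uint8_t *in, uint8_t *out, size_t nbytes,
 *	    uint8_t tweak[16])
 *	{
 *		uint8_t t[16];
 *
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (unsigned j = 0; j < 16; j++)
 *				t[j] = in[j] ^ tweak[j];  // ptxt ^ tweak
 *			aes_enc(t, t);			  // E_k(ptxt ^ tweak)
 *			for (unsigned j = 0; j < 16; j++)
 *				out[j] = t[j] ^ tweak[j]; // xor tweak again
 *			xts_mulx(tweak);		  // tweak *= x
 *		}
 *	}
 */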
    662 
    663 /*
    664  * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    665  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    666  *     uint32_t nrounds@x5)
    667  *
    668  *	Encrypt a contiguous sequence of blocks with AES-XTS.
    669  *
    670  *	nbytes must be a positive integral multiple of 128.
    671  *
    672  *	Standard ABI calling convention.
    673  */
    674 ENTRY(aesarmv8_xts_enc8)
    675 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    676 	mov	fp, sp
    677 	mov	x9, x0			/* x9 := enckey */
    678 	mov	x10, x3			/* x10 := nbytes */
    679 	ld1	{v31.16b}, [x4]		/* q31 := tweak */
    680 	_ALIGN_TEXT
    681 1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
    682 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    683 	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
    684 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    685 	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
    686 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    687 	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
    688 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    689 	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
    690 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    691 	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
    692 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    693 	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
    694 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    695 					/* q31 := tweak[7] */
    696 	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ptxt[i] */
    697 	ld1	{v4.16b-v7.16b}, [x1], #0x40
    698 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
    699 	eor	v1.16b, v1.16b, v25.16b
    700 	eor	v2.16b, v2.16b, v26.16b
    701 	eor	v3.16b, v3.16b, v27.16b
    702 	eor	v4.16b, v4.16b, v28.16b
    703 	eor	v5.16b, v5.16b, v29.16b
    704 	eor	v6.16b, v6.16b, v30.16b
    705 	eor	v7.16b, v7.16b, v31.16b
    706 	mov	x0, x9			/* x0 := enckey */
    707 	mov	x3, x5			/* x3 := nrounds */
    708 	bl	aesarmv8_enc8		/* encrypt q0-q7; trash x0/x3/q16 */
    709 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
    710 	eor	v1.16b, v1.16b, v25.16b
    711 	eor	v2.16b, v2.16b, v26.16b
    712 	eor	v3.16b, v3.16b, v27.16b
    713 	eor	v4.16b, v4.16b, v28.16b
    714 	eor	v5.16b, v5.16b, v29.16b
    715 	eor	v6.16b, v6.16b, v30.16b
    716 	eor	v7.16b, v7.16b, v31.16b
    717 	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store ciphertext blocks */
    718 	st1	{v4.16b-v7.16b}, [x2], #0x40
    719 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    720 	subs	x10, x10, #0x80		/* count down nbytes */
    721 	b.ne	1b			/* repeat if more block groups */
    722 	st1	{v31.16b}, [x4]		/* update tweak */
    723 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    724 	ret
    725 END(aesarmv8_xts_enc8)
    726 
    727 /*
    728  * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    729  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    730  *     uint32_t nrounds@x5)
    731  *
     732  *	Decrypt a contiguous sequence of blocks with AES-XTS.
    733  *
    734  *	nbytes must be a positive integral multiple of 16.  This routine
    735  *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
    736  *
    737  *	Standard ABI calling convention.
    738  */
    739 ENTRY(aesarmv8_xts_dec1)
    740 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    741 	mov	fp, sp
    742 	mov	x9, x0			/* x9 := deckey */
    743 	mov	x10, x3			/* x10 := nbytes */
    744 	ld1	{v31.16b}, [x4]		/* q31 := tweak */
    745 	_ALIGN_TEXT
    746 1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ctxt */
    747 	mov	x0, x9			/* x0 := deckey */
    748 	mov	x3, x5			/* x3 := nrounds */
    749 	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
    750 	bl	aesarmv8_dec1		/* q0 := AES(...); trash x0/x3/q16 */
    751 	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
    752 	st1	{v0.16b}, [x2], #0x10	/* store plaintext block */
    753 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    754 	subs	x10, x10, #0x10		/* count down nbytes */
    755 	b.ne	1b			/* repeat if more blocks */
    756 	st1	{v31.16b}, [x4]		/* update tweak */
    757 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    758 	ret
    759 END(aesarmv8_xts_dec1)
    760 
    761 /*
    762  * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    763  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    764  *     uint32_t nrounds@x5)
    765  *
     766  *	Decrypt a contiguous sequence of blocks with AES-XTS.
    767  *
    768  *	nbytes must be a positive integral multiple of 128.
    769  *
    770  *	Standard ABI calling convention.
    771  */
    772 ENTRY(aesarmv8_xts_dec8)
    773 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    774 	mov	fp, sp
    775 	mov	x9, x0			/* x9 := deckey */
    776 	mov	x10, x3			/* x10 := nbytes */
    777 	ld1	{v31.16b}, [x4]		/* q31 := tweak */
    778 	_ALIGN_TEXT
    779 1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
    780 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    781 	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
    782 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    783 	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
    784 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    785 	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
    786 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    787 	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
    788 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    789 	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
    790 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    791 	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
    792 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    793 					/* q31 := tweak[7] */
    794 	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ctxt[i] */
    795 	ld1	{v4.16b-v7.16b}, [x1], #0x40
    796 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
    797 	eor	v1.16b, v1.16b, v25.16b
    798 	eor	v2.16b, v2.16b, v26.16b
    799 	eor	v3.16b, v3.16b, v27.16b
    800 	eor	v4.16b, v4.16b, v28.16b
    801 	eor	v5.16b, v5.16b, v29.16b
    802 	eor	v6.16b, v6.16b, v30.16b
    803 	eor	v7.16b, v7.16b, v31.16b
    804 	mov	x0, x9			/* x0 := deckey */
    805 	mov	x3, x5			/* x3 := nrounds */
    806 	bl	aesarmv8_dec8		/* decrypt q0-q7; trash x0/x3/q16 */
    807 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
    808 	eor	v1.16b, v1.16b, v25.16b
    809 	eor	v2.16b, v2.16b, v26.16b
    810 	eor	v3.16b, v3.16b, v27.16b
    811 	eor	v4.16b, v4.16b, v28.16b
    812 	eor	v5.16b, v5.16b, v29.16b
    813 	eor	v6.16b, v6.16b, v30.16b
    814 	eor	v7.16b, v7.16b, v31.16b
    815 	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store plaintext blocks */
    816 	st1	{v4.16b-v7.16b}, [x2], #0x40
    817 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    818 	subs	x10, x10, #0x80		/* count down nbytes */
    819 	b.ne	1b			/* repeat if more block groups */
    820 	st1	{v31.16b}, [x4]		/* update tweak */
    821 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    822 	ret
    823 END(aesarmv8_xts_dec8)
    824 
    825 /*
    826  * aesarmv8_xts_mulx(tweak@q31)
    827  *
    828  *	Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
    829  *	Uses x0 and q0/q1 as temporaries.
    830  */
    831 	.text
    832 	_ALIGN_TEXT
    833 	.type	aesarmv8_xts_mulx,@function
    834 aesarmv8_xts_mulx:
    835 	/*
    836 	 * Simultaneously determine
    837 	 * (a) whether the high bit of the low half must be
    838 	 *     shifted into the low bit of the high half, and
    839 	 * (b) whether the high bit of the high half must be
    840 	 *     carried into x^128 = x^7 + x^2 + x + 1.
    841 	 */
    842 	adrl	x0, xtscarry
    843 	cmlt	v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
    844 	ld1	{v0.16b}, [x0]		/* q0 := xtscarry */
    845 	ext	v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
    846 	shl	v31.2d, v31.2d, #1	/* shift */
    847 	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
    848 	eor	v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */
    849 	ret
    850 END(aesarmv8_xts_mulx)
    851 
    852 	.section .rodata
    853 	.p2align 4
    854 	.type	xtscarry,@object
    855 xtscarry:
    856 	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
    857 END(xtscarry)
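
/*
 *	Illustrative sketch (not part of this file) of the scalar
 *	equivalent of aesarmv8_xts_mulx, with the tweak viewed as two
 *	little-endian 64-bit halves: double both halves, carry the low
 *	half's top bit into the high half, and fold the high half's top
 *	bit back in as x^7 + x^2 + x + 1 = 0x87 -- exactly the two lanes
 *	of xtscarry above.
 *
 *	static void
 *	xts_mulx(uint64_t t[2])
 *	{
 *		uint64_t carry_lo = t[0] >> 63;	// into bit 0 of high half
 *		uint64_t carry_hi = t[1] >> 63;	// into the x^128 reduction
 *
 *		t[1] = (t[1] << 1) | carry_lo;
 *		t[0] = (t[0] << 1) ^ (carry_hi ? 0x87 : 0);
 *	}
 */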
    858 
    859 /*
    860  * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
    861  *
    862  *	Update an AES-XTS tweak.
    863  *
    864  *	Standard ABI calling convention.
    865  */
    866 ENTRY(aesarmv8_xts_update)
    867 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    868 	mov	fp, sp
    869 	ld1	{v31.16b}, [x0]		/* load tweak */
    870 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    871 	st1	{v31.16b}, [x1]		/* store tweak */
    872 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    873 	ret
    874 END(aesarmv8_xts_update)
    875 
    876 /*
    877  * aesarmv8_cbcmac_update1(const struct aesenc *enckey@x0,
    878  *     const uint8_t *in@x1, size_t nbytes@x2, uint8_t auth[16] @x3,
    879  *     uint32_t nrounds@x4)
    880  *
    881  *	Update CBC-MAC.
    882  *
    883  *	nbytes must be a positive integral multiple of 16.
    884  *
    885  *	Standard ABI calling convention.
    886  */
    887 ENTRY(aesarmv8_cbcmac_update1)
    888 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    889 	mov	fp, sp
    890 	ld1	{v0.16b}, [x3]		/* q0 := initial authenticator */
    891 	mov	x9, x0			/* x9 := enckey */
    892 	mov	x5, x3			/* x5 := &auth (enc1 trashes x3) */
    893 	_ALIGN_TEXT
    894 1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
    895 	mov	x0, x9			/* x0 := enckey */
    896 	mov	x3, x4			/* x3 := nrounds */
    897 	eor	v0.16b, v0.16b, v1.16b	/* q0 := auth ^ ptxt */
    898 	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */
    899 	subs	x2, x2, #0x10		/* count down nbytes */
     900 	b.ne	1b			/* repeat if x2 is nonzero */
    901 	st1	{v0.16b}, [x5]		/* store updated authenticator */
    902 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    903 	ret
    904 END(aesarmv8_cbcmac_update1)
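
/*
 *	Illustrative sketch (not part of this file) of the CBC-MAC update
 *	above; aes_enc() is an assumed stand-in for aesarmv8_enc1.
 *
 *	static void
 *	cbcmac_update(const uint8_t *in, size_t nbytes, uint8_t auth[16])
 *	{
 *		for (; nbytes; nbytes -= 16, in += 16) {
 *			for (unsigned j = 0; j < 16; j++)
 *				auth[j] ^= in[j];	// absorb block
 *			aes_enc(auth, auth);		// auth := E_k(auth)
 *		}
 *	}
 */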
    905 
    906 /*
    907  * aesarmv8_ccm_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    908  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
    909  *     uint32_t nrounds@x5)
    910  *
    911  *	Update CCM encryption.
    912  *
    913  *	nbytes must be a positive integral multiple of 16.
    914  *
    915  *	Standard ABI calling convention.
    916  */
    917 ENTRY(aesarmv8_ccm_enc1)
    918 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    919 	mov	fp, sp
    920 	ld1	{v0.16b-v1.16b}, [x4]	/* q0 := auth, q1 := ctr (be) */
    921 	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
    922 	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
    923 	mov	x9, x0			/* x9 := enckey */
    924 	mov	x10, x3			/* x10 := nbytes */
    925 	rev32	v2.16b, v1.16b		/* q2 := ctr (host-endian) */
    926 	_ALIGN_TEXT
    927 1:	ld1	{v3.16b}, [x1], #0x10	/* q3 := plaintext block */
    928 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
    929 	mov	x0, x9			/* x0 := enckey */
    930 	mov	x3, x5			/* x3 := nrounds */
    931 	rev32	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
    932 	eor	v0.16b, v0.16b, v3.16b	/* q0 := auth ^ ptxt */
    933 	bl	aesarmv8_enc2		/* q0 := auth', q1 := pad;
    934 					 * trash x0/x3/q16 */
    935 	eor	v3.16b, v1.16b, v3.16b	/* q3 := ciphertext block */
    936 	subs	x10, x10, #0x10		/* count down bytes */
    937 	st1	{v3.16b}, [x2], #0x10	/* store ciphertext block */
    938 	b.ne	1b			/* repeat if more blocks */
    939 	rev32	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
    940 	st1	{v0.16b-v1.16b}, [x4]	/* store updated auth/ctr */
    941 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    942 	ret
    943 END(aesarmv8_ccm_enc1)
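
/*
 *	Illustrative sketch (not part of this file) of one step of the
 *	CCM encryption loop above: the CBC-MAC update and the CTR pad are
 *	two independent AES invocations, which the code fuses into
 *	aesarmv8_enc2 for instruction-level parallelism.  aes_enc() and
 *	ctr32_inc() (increment the big-endian low 32 bits of the counter
 *	block) are assumed stand-ins.
 *
 *	static void
 *	ccm_enc_step(const uint8_t in[16], uint8_t out[16],
 *	    uint8_t auth[16], uint8_t ctr[16])
 *	{
 *		uint8_t pad[16];
 *
 *		ctr32_inc(ctr);
 *		for (unsigned j = 0; j < 16; j++)
 *			auth[j] ^= in[j];		// CBC-MAC absorb
 *		aes_enc(auth, auth);			// auth := E_k(auth)
 *		aes_enc(ctr, pad);			// pad := E_k(ctr)
 *		for (unsigned j = 0; j < 16; j++)
 *			out[j] = in[j] ^ pad[j];	// CTR encrypt
 *	}
 */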
    944 
    945 /*
    946  * aesarmv8_ccm_dec1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    947  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
    948  *     uint32_t nrounds@x5)
    949  *
    950  *	Update CCM decryption.
    951  *
    952  *	nbytes must be a positive integral multiple of 16.
    953  *
    954  *	Standard ABI calling convention.
    955  */
    956 ENTRY(aesarmv8_ccm_dec1)
    957 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    958 	mov	fp, sp
    959 	ld1	{v1.16b, v2.16b}, [x4]	/* q1 := auth, q2 := ctr (be) */
    960 	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
    961 	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
    962 	mov	x9, x0			/* x9 := enckey */
    963 	mov	x10, x3			/* x10 := nbytes */
    964 	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */
    965 
    966 	/* Decrypt the first block.  */
    967 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
    968 	mov	x3, x5			/* x3 := nrounds */
    969 	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
    970 	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
    971 	bl	aesarmv8_enc1		/* q0 := pad; trash x0/x3/q16 */
    972 	b	2f
    973 
    974 	_ALIGN_TEXT
    975 1:	/*
    976 	 * Authenticate the last block and decrypt the next block
    977 	 * simultaneously.
    978 	 *
    979 	 *	q1 = auth ^ ptxt[-1]
    980 	 *	q2 = ctr[-1] (le)
    981 	 */
    982 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
    983 	mov	x0, x9			/* x0 := enckey */
    984 	mov	x3, x5			/* x3 := nrounds */
    985 	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
    986 	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
    987 	bl	aesarmv8_enc2		/* q0 := pad, q1 := auth';
    988 					 * trash x0/x3/q16 */
    989 2:	eor	v3.16b, v0.16b, v3.16b	/* q3 := plaintext block */
    990 	subs	x10, x10, #0x10
    991 	st1	{v3.16b}, [x2], #0x10		/* store plaintext */
    992 	eor	v1.16b, v1.16b, v3.16b	/* q1 := auth ^ ptxt */
    993 	b.ne	1b
    994 
    995 	rev32	v2.16b, v2.16b		/* q2 := ctr (big-endian) */
    996 
    997 	/* Authenticate the last block.  */
    998 	mov	x0, x9			/* x0 := enckey */
    999 	mov	x3, x5			/* x3 := nrounds */
   1000 	mov	v0.16b, v1.16b		/* q0 := auth ^ ptxt */
   1001 	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */
   1002 
   1003 	mov	v1.16b, v2.16b		/* store updated auth/ctr */
   1004 	st1	{v0.16b-v1.16b}, [x4]
   1005 	ldp	fp, lr, [sp], #16	/* pop stack frame */
   1006 	ret
   1007 END(aesarmv8_ccm_dec1)
   1008 
   1009 	.section .rodata
   1010 	.p2align 4
   1011 	.type	ctr32_inc,@object
   1012 ctr32_inc:
   1013 	.int	0, 0, 0, 1
   1014 END(ctr32_inc)
   1015 
   1016 /*
   1017  * aesarmv8_enc1(const struct aesenc *enckey@x0,
   1018  *     uint128_t block@q0, uint32_t nrounds@x3)
   1019  *
   1020  *	Encrypt a single AES block in q0.
   1021  *
   1022  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1023  */
   1024 	.text
   1025 	_ALIGN_TEXT
   1026 	.type	aesarmv8_enc1,@function
   1027 aesarmv8_enc1:
   1028 	ldr	q16, [x0], #0x10	/* load round key */
   1029 	sub	x3, x3, #1
   1030 	_ALIGN_TEXT
   1031 1:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0)))) */
   1032 	aese	v0.16b, v16.16b
   1033 	aesmc	v0.16b, v0.16b
   1034 	ldr	q16, [x0], #0x10
   1035 	subs	x3, x3, #1
   1036 	b.ne	1b
   1037 	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
   1038 	aese	v0.16b, v16.16b
   1039 	ldr	q16, [x0]		/* load last round key */
   1040 	/* q0 := AddRoundKey_q16(q0) */
   1041 	eor	v0.16b, v0.16b, v16.16b
   1042 	ret
   1043 END(aesarmv8_enc1)
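
/*
 *	Illustrative sketch (not part of this file) of the same round
 *	structure using the ACLE intrinsics from <arm_neon.h> (built with
 *	the aes target feature); rk[] holds the nrounds+1 expanded round
 *	keys.
 *
 *	static uint8x16_t
 *	aes_enc_block(uint8x16_t b, const uint8x16_t rk[], unsigned nrounds)
 *	{
 *		for (unsigned i = 0; i < nrounds - 1; i++) {
 *			b = vaeseq_u8(b, rk[i]);  // AddRoundKey+SubBytes+ShiftRows
 *			b = vaesmcq_u8(b);	  // MixColumns
 *		}
 *		b = vaeseq_u8(b, rk[nrounds - 1]); // last round, no MixColumns
 *		return veorq_u8(b, rk[nrounds]);   // final AddRoundKey
 *	}
 */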
   1044 
   1045 /*
   1046  * aesarmv8_enc2(const struct aesenc *enckey@x0,
   1047  *     uint128_t block@q0, uint128_t block@q1, uint32_t nrounds@x3)
   1048  *
   1049  *	Encrypt two AES blocks in q0 and q1.
   1050  *
   1051  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1052  */
   1053 	.text
   1054 	_ALIGN_TEXT
   1055 	.type	aesarmv8_enc2,@function
   1056 aesarmv8_enc2:
   1057 	ldr	q16, [x0], #0x10	/* load round key */
   1058 	sub	x3, x3, #1
   1059 	_ALIGN_TEXT
   1060 1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
   1061 	aese	v0.16b, v16.16b
   1062 	aesmc	v0.16b, v0.16b
   1063 	aese	v1.16b, v16.16b
   1064 	aesmc	v1.16b, v1.16b
   1065 	ldr	q16, [x0], #0x10	/* load next round key */
   1066 	subs	x3, x3, #1
   1067 	b.ne	1b
   1068 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
   1069 	aese	v0.16b, v16.16b
   1070 	aese	v1.16b, v16.16b
   1071 	ldr	q16, [x0]		/* load last round key */
   1072 	/* q[i] := AddRoundKey_q16(q[i]) */
   1073 	eor	v0.16b, v0.16b, v16.16b
   1074 	eor	v1.16b, v1.16b, v16.16b
   1075 	ret
   1076 END(aesarmv8_enc2)
   1077 
   1078 /*
   1079  * aesarmv8_enc8(const struct aesenc *enckey@x0,
   1080  *     uint128_t block0@q0, ..., uint128_t block7@q7,
   1081  *     uint32_t nrounds@x3)
   1082  *
   1083  *	Encrypt eight AES blocks in q0 through q7 in parallel.
   1084  *
   1085  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1086  */
   1087 	.text
   1088 	_ALIGN_TEXT
   1089 	.type	aesarmv8_enc8,@function
   1090 aesarmv8_enc8:
   1091 	ldr	q16, [x0], #0x10	/* load round key */
   1092 	sub	x3, x3, #1
   1093 	_ALIGN_TEXT
   1094 1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
   1095 	aese	v0.16b, v16.16b
   1096 	aesmc	v0.16b, v0.16b
   1097 	aese	v1.16b, v16.16b
   1098 	aesmc	v1.16b, v1.16b
   1099 	aese	v2.16b, v16.16b
   1100 	aesmc	v2.16b, v2.16b
   1101 	aese	v3.16b, v16.16b
   1102 	aesmc	v3.16b, v3.16b
   1103 	aese	v4.16b, v16.16b
   1104 	aesmc	v4.16b, v4.16b
   1105 	aese	v5.16b, v16.16b
   1106 	aesmc	v5.16b, v5.16b
   1107 	aese	v6.16b, v16.16b
   1108 	aesmc	v6.16b, v6.16b
   1109 	aese	v7.16b, v16.16b
   1110 	aesmc	v7.16b, v7.16b
   1111 	ldr	q16, [x0], #0x10	/* load next round key */
   1112 	subs	x3, x3, #1
   1113 	b.ne	1b
   1114 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
   1115 	aese	v0.16b, v16.16b
   1116 	aese	v1.16b, v16.16b
   1117 	aese	v2.16b, v16.16b
   1118 	aese	v3.16b, v16.16b
   1119 	aese	v4.16b, v16.16b
   1120 	aese	v5.16b, v16.16b
   1121 	aese	v6.16b, v16.16b
   1122 	aese	v7.16b, v16.16b
   1123 	ldr	q16, [x0]		/* load last round key */
   1124 	/* q[i] := AddRoundKey_q16(q[i]) */
   1125 	eor	v0.16b, v0.16b, v16.16b
   1126 	eor	v1.16b, v1.16b, v16.16b
   1127 	eor	v2.16b, v2.16b, v16.16b
   1128 	eor	v3.16b, v3.16b, v16.16b
   1129 	eor	v4.16b, v4.16b, v16.16b
   1130 	eor	v5.16b, v5.16b, v16.16b
   1131 	eor	v6.16b, v6.16b, v16.16b
   1132 	eor	v7.16b, v7.16b, v16.16b
   1133 	ret
   1134 END(aesarmv8_enc8)
   1135 
   1136 /*
   1137  * aesarmv8_dec1(const struct aesdec *deckey@x0,
   1138  *     uint128_t block@q0, uint32_t nrounds@x3)
   1139  *
   1140  *	Decrypt a single AES block in q0.
   1141  *
   1142  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1143  */
   1144 	.text
   1145 	_ALIGN_TEXT
   1146 	.type	aesarmv8_dec1,@function
   1147 aesarmv8_dec1:
   1148 	ldr	q16, [x0], #0x10	/* load round key */
   1149 	sub	x3, x3, #1
   1150 	_ALIGN_TEXT
   1151 1:	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
   1152 	aesd	v0.16b, v16.16b
   1153 	/* q0 := InMixColumns(q0) */
   1154 	aesimc	v0.16b, v0.16b
   1155 	ldr	q16, [x0], #0x10	/* load next round key */
   1156 	subs	x3, x3, #1
   1157 	b.ne	1b
   1158 	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
   1159 	aesd	v0.16b, v16.16b
   1160 	ldr	q16, [x0]		/* load last round key */
   1161 	/* q0 := AddRoundKey_q16(q0) */
   1162 	eor	v0.16b, v0.16b, v16.16b
   1163 	ret
   1164 END(aesarmv8_dec1)
   1165 
   1166 /*
   1167  * aesarmv8_dec8(const struct aesdec *deckey@x0,
   1168  *     uint128_t block0@q0, ..., uint128_t block7@q7,
   1169  *     uint32_t nrounds@x3)
   1170  *
   1171  *	Decrypt eight AES blocks in q0 through q7 in parallel.
   1172  *
   1173  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1174  */
   1175 	.text
   1176 	_ALIGN_TEXT
   1177 	.type	aesarmv8_dec8,@function
   1178 aesarmv8_dec8:
   1179 	ldr	q16, [x0], #0x10	/* load round key */
   1180 	sub	x3, x3, #1
   1181 	_ALIGN_TEXT
   1182 1:	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
   1183 	aesd	v0.16b, v16.16b
   1184 	/* q[i] := InMixColumns(q[i]) */
   1185 	aesimc	v0.16b, v0.16b
   1186 	aesd	v1.16b, v16.16b
   1187 	aesimc	v1.16b, v1.16b
   1188 	aesd	v2.16b, v16.16b
   1189 	aesimc	v2.16b, v2.16b
   1190 	aesd	v3.16b, v16.16b
   1191 	aesimc	v3.16b, v3.16b
   1192 	aesd	v4.16b, v16.16b
   1193 	aesimc	v4.16b, v4.16b
   1194 	aesd	v5.16b, v16.16b
   1195 	aesimc	v5.16b, v5.16b
   1196 	aesd	v6.16b, v16.16b
   1197 	aesimc	v6.16b, v6.16b
   1198 	aesd	v7.16b, v16.16b
   1199 	aesimc	v7.16b, v7.16b
   1200 	ldr	q16, [x0], #0x10	/* load next round key */
   1201 	subs	x3, x3, #1
   1202 	b.ne	1b
   1203 	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
   1204 	aesd	v0.16b, v16.16b
   1205 	aesd	v1.16b, v16.16b
   1206 	aesd	v2.16b, v16.16b
   1207 	aesd	v3.16b, v16.16b
   1208 	aesd	v4.16b, v16.16b
   1209 	aesd	v5.16b, v16.16b
   1210 	aesd	v6.16b, v16.16b
   1211 	aesd	v7.16b, v16.16b
   1212 	ldr	q16, [x0]		/* load last round key */
   1213 	/* q[i] := AddRoundKey_q16(q[i]) */
   1214 	eor	v0.16b, v0.16b, v16.16b
   1215 	eor	v1.16b, v1.16b, v16.16b
   1216 	eor	v2.16b, v2.16b, v16.16b
   1217 	eor	v3.16b, v3.16b, v16.16b
   1218 	eor	v4.16b, v4.16b, v16.16b
   1219 	eor	v5.16b, v5.16b, v16.16b
   1220 	eor	v6.16b, v6.16b, v16.16b
   1221 	eor	v7.16b, v7.16b, v16.16b
   1222 	ret
   1223 END(aesarmv8_dec8)
   1224