/*	$NetBSD: aes_armv8_64.S,v 1.7 2020/07/25 22:32:09 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

	.arch_extension	aes

/*
 * uint32_t rcon[10]
 *
 *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2).
 *	Such elements of GF(2^8) need only eight bits to be represented,
 *	but we store them in 4-byte units so we can copy one into all
 *	four 4-byte lanes of a vector register with a single LD1R.  The
 *	access pattern is fixed, so indices into this table are never
 *	secret.
 */
	.section .rodata
	.p2align 2
	.type	rcon,@object
rcon:
	.long	0x01
	.long	0x02
	.long	0x04
	.long	0x08
	.long	0x10
	.long	0x20
	.long	0x40
	.long	0x80
	.long	0x1b
	.long	0x36
END(rcon)
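
/*
 * For reference only (not assembled): the table above can be
 * regenerated with a C sketch along these lines, where xtime is the
 * usual multiply-by-x step in GF(2^8) reduced by the AES polynomial:
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	static uint8_t
 *	xtime(uint8_t x)
 *	{
 *		return (x << 1) ^ ((x & 0x80) ? 0x1b : 0);
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		uint8_t rc = 0x01;
 *		for (unsigned i = 0; i < 10; i++, rc = xtime(rc))
 *			printf("\t.long\t0x%02x\n", (unsigned)rc);
 *		return 0;
 *	}
 */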

/*
 * uint128_t unshiftrows_rotword_1
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 1, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_1,@object
unshiftrows_rotword_1:
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
END(unshiftrows_rotword_1)

/*
 * uint128_t unshiftrows_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then copy word
 *	3 into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_3,@object
unshiftrows_3:
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
END(unshiftrows_3)

/*
 * uint128_t unshiftrows_rotword_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 3, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_3,@object
unshiftrows_rotword_3:
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
END(unshiftrows_rotword_3)
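
/*
 * For reference only (not assembled): the three tables above fall out
 * of the ShiftRows permutation.  ShiftRows moves the byte in row r,
 * column c of the column-major state to column (c - r) mod 4, so the
 * un-shifted byte 4*c + r of the AESE output is found at index
 * 4*((c - r) mod 4) + r.  Each table repeats its four indices four
 * times to broadcast the selected word into all four lanes.  A C
 * sketch that prints the four-byte patterns:
 *
 *	#include <stdio.h>
 *
 *	static unsigned
 *	shifted_index(unsigned c, unsigned r)
 *	{
 *		return 4*((c - r) & 3) + r;
 *	}
 *
 *	static void
 *	print_table(const char *name, unsigned c, unsigned rot)
 *	{
 *		printf("%s:", name);
 *		for (unsigned i = 0; i < 4; i++)
 *			printf(" 0x%02x", shifted_index(c, (i + rot) & 3));
 *		printf("\n");
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		print_table("unshiftrows_rotword_1", 1, 1);
 *		print_table("unshiftrows_3", 3, 0);
 *		print_table("unshiftrows_rotword_3", 3, 1);
 *		return 0;
 *	}
 */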

/*
 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey128)
	ldr	q1, [x1]	/* q1 := master key */

	adrl	x4, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 table */

	str	q1, [x0], #0x10	/* store master key as first round key */
	mov	x2, #10		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x0 = pointer to round key to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #1	/* count down rounds */
	str	q1, [x0], #0x10	/* store round key */
	b.ne	1b

	ret
END(aesarmv8_setenckey128)
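
/*
 * For reference (C-like sketch, not part of the build): each loop
 * iteration above computes, from the previous round key prk[0..3],
 *
 *	t = RotWord(SubWord(prk[3])) ^ rcon[i];
 *	rk[0] = t ^ prk[0];
 *	rk[1] = t ^ prk[0] ^ prk[1];
 *	rk[2] = t ^ prk[0] ^ prk[1] ^ prk[2];
 *	rk[3] = t ^ prk[0] ^ prk[1] ^ prk[2] ^ prk[3];
 *
 * with t broadcast into all four lanes of q3 by the TBL.  The three
 * EXTs build the shifted copies (0,prk[0],prk[1],prk[2]) &c., so the
 * four EORs evaluate all four prefix XORs in parallel, one 128-bit
 * operation per term.
 */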

/*
 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey192)
	ldr	q1, [x1], #0x10	/* q1 := master key[0:128) */
	ldr	d2, [x1]	/* d2 := master key[128:192) */

	adrl	x4, unshiftrows_rotword_1
	adrl	x5, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_1 */
	ldr	q17, [x5]	/* q17 := unshiftrows_rotword_3 */

	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
	mov	x2, #12		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
	 * x0 = pointer to three round keys to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *     ^ rklo[1]
	 */

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
	eor	v5.16b, v5.16b, v1.16b
	eor	v5.16b, v5.16b, v3.16b
	eor	v5.16b, v5.16b, v6.16b
	eor	v5.16b, v5.16b, v7.16b

	/*
	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
	 * (rklo[0],rklo[1],...).
	 */

	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	dup	v1.4s, v5.s[3]
	mov	v1.s[0], v5.s[2]

	/*
	 * v6.4s := (0, 0, rklo[0], rklo[1])
	 * v7.4s := (0, 0, 0, rklo[0])
	 */
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v3.16b, v1.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	/*
	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
	 */
	mov	v2.d[1], v5.d[0]

	/* store two round keys */
	stp	q2, q3, [x0], #0x20

	/*
	 * Live vector registers at this point:
	 *
	 *	q0 = zero
	 *	q2 = rk
	 *	q3 = nrk
	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
	 *	q16 = unshiftrows_rotword_1
	 *	q17 = unshiftrows_rotword_3
	 *
	 * We have to compute, in q1:
	 *
	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1]
	 *
	 * And, if there's any more afterward, in q2:
	 *
	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2]
	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2] ^ nrk[3]
	 */

	/* q1 := RotWords(SubBytes(q3)) */
	mov	v1.16b, v3.16b
	aese	v1.16b, v0.16b

	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
	ld1r	{v4.4s}, [x3], #4
	tbl	v1.16b, {v1.16b}, v17.16b
	eor	v1.16b, v1.16b, v4.16b

	/*
	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already]
	 * v4.4s := (0, rk[2], rk[3], nrk[0])
	 * v6.4s := (0, 0, rk[2], rk[3])
	 * v7.4s := (0, 0, 0, rk[2])
	 */
	ext	v4.16b, v0.16b, v5.16b, #12
	ext	v6.16b, v0.16b, v5.16b, #8
	ext	v7.16b, v0.16b, v5.16b, #4

	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #3	/* count down three rounds */
	str	q1, [x0], #0x10	/* store third round key */
	b.eq	2f

	/*
	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
	 * v5.4s := (0, nrk[2], xxx, xxx)
	 */
	ext	v4.16b, v3.16b, v0.16b, #8
	ext	v5.16b, v0.16b, v4.16b, #12

	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
	dup	v2.4s, v1.s[3]

	/*
	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
	 *     xxx, xxx)
	 */
	eor	v2.16b, v2.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	b	1b

2:	ret
END(aesarmv8_setenckey192)
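
/*
 * For reference (C sketch, not part of the build): in the flat
 * word-oriented form of FIPS 197, the AES-192 schedule generates words
 * w[6..51] from the 6-word key by
 *
 *	for (i = 6; i < 52; i++) {
 *		uint32_t t = w[i-1];
 *		if (i % 6 == 0)
 *			t = RotWord(SubWord(t)) ^ rcon[i/6 - 1];
 *		w[i] = w[i-6] ^ t;
 *	}
 *
 * Each iteration of the loop above produces twelve words -- three
 * 128-bit round keys, consuming two round constants -- which is why it
 * stores q2, q3, and then q1, and counts x2 down by three.
 */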

/*
 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey256)
	/* q1 := key[0:128), q2 := key[128:256) */
	ldp	q1, q2, [x1], #0x20

	adrl	x4, unshiftrows_rotword_3
	adrl	x5, unshiftrows_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 */
	ldr	q17, [x5]	/* q17 := unshiftrows_3 */

	/* store master key as first two round keys */
	stp	q1, q2, [x0], #0x20
	mov	x2, #14		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
	 * v6.4s := (0,0,pprk[0],pprk[1])
	 * v7.4s := (0,0,0,pprk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #2		/* count down two rounds */
	b.eq	2f			/* stop if this is the last one */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := SubBytes(rk[3]) */
	tbl	v3.16b, {v3.16b}, v17.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v2.16b, #12
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v2.16b, v2.16b, v3.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v2.16b, v2.16b, v7.16b

	stp	q1, q2, [x0], #0x20	/* store two round keys */
	b	1b

2:	str	q1, [x0]		/* store last round key */
	ret
END(aesarmv8_setenckey256)
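
/*
 * For reference (C sketch, not part of the build): the AES-256
 * schedule generates words w[8..59] from the 8-word key by
 *
 *	for (i = 8; i < 60; i++) {
 *		uint32_t t = w[i-1];
 *		if (i % 8 == 0)
 *			t = RotWord(SubWord(t)) ^ rcon[i/8 - 1];
 *		else if (i % 8 == 4)
 *			t = SubWord(t);
 *		w[i] = w[i-8] ^ t;
 *	}
 *
 * Each iteration of the loop above produces eight words, i.e. two
 * round keys: the first four use unshiftrows_rotword_3 (RotWord and
 * SubWord), the second four use unshiftrows_3 (SubWord only).
 */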

/*
 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
 *     uint32_t nrounds@x2)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enctodec)
	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
	b	2f
1:	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
2:	str	q0, [x1], #0x10	/* store round key */
	subs	x2, x2, #1	/* count down round */
	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
	b.ne	1b		/* repeat if there's more */
	str	q0, [x1]	/* store first round key verbatim */
	ret
END(aesarmv8_enctodec)
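
/*
 * For reference (C-like sketch): this computes the key schedule of the
 * equivalent inverse cipher in FIPS 197,
 *
 *	dec[0] = enc[nrounds];
 *	for (i = 1; i < nrounds; i++)
 *		dec[i] = InvMixColumns(enc[nrounds - i]);
 *	dec[nrounds] = enc[0];
 *
 * where each element is a 128-bit round key.  AESIMC supplies the
 * InvMixColumns; the first and last round keys are used only for
 * AddRoundKey, so they are copied verbatim.
 */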

/*
 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enc)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]	/* q0 := ptxt */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	str	q0, [x2]	/* store ctxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_enc)

/*
 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_dec)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]	/* q0 := ctxt */
	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
	str	q0, [x2]	/* store ptxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_dec)

/*
 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_enc)
	cbz	x3, 2f			/* stop if nothing to do */
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q0, [x4]		/* q0 := chaining value */
1:	ldr	q1, [x1], #0x10		/* q1 := plaintext block */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_enc1		/* q0 := ctxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10		/* count down nbytes */
	str	q0, [x2], #0x10		/* store ciphertext block */
	b.ne	1b			/* repeat if x10 is nonzero */
	str	q0, [x4]		/* store chaining value */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
2:	ret
END(aesarmv8_cbc_enc)
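
/*
 * CBC encryption is inherently sequential -- each block's input
 * depends on the previous block's output -- so there is no eight-block
 * variant of this routine.  For reference (C-like sketch; xor128 and
 * aes_enc are illustrative names):
 *
 *	cv = iv;
 *	for (i = 0; i < nbytes/16; i++) {
 *		cv = aes_enc(key, xor128(cv, in[i]), nrounds);
 *		out[i] = cv;
 *	}
 *	iv = cv;
 */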

/*
 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q24, [x4]		/* q24 := iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	ldr	q0, [x1, #-0x10]!	/* q0 := last ciphertext block */
	str	q0, [x4]		/* update iv */
	b	2f
1:	ldr	q31, [x1, #-0x10]!	/* q31 := chaining value */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
	str	q0, [x2, #-0x10]!	/* store plaintext block */
	mov	v0.16b, v31.16b		/* move cv = ciphertext block */
2:	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
	str	q0, [x2, #-0x10]!	/* store first plaintext block */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec1)

/*
 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q24, [x4]		/* q24 := iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	ldp	q6, q7, [x1, #-0x20]!	/* q6, q7 := last ciphertext blocks */
	str	q7, [x4]		/* update iv */
	b	2f
1:	ldp	q6, q7, [x1, #-0x20]!
	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!
2:	ldp	q4, q5, [x1, #-0x20]!
	ldp	q2, q3, [x1, #-0x20]!
	ldp	q0, q1, [x1, #-0x20]!
	mov	v31.16b, v6.16b		/* q[24+i] := cv[i], 0<i<8 */
	mov	v30.16b, v5.16b
	mov	v29.16b, v4.16b
	mov	v28.16b, v3.16b
	mov	v27.16b, v2.16b
	mov	v26.16b, v1.16b
	mov	v25.16b, v0.16b
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec8		/* q[i] := cv[i] ^ pt[i];
					 * trash x0/x3/q16 */
	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
	eor	v6.16b, v6.16b, v30.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v1.16b, v1.16b, v25.16b
	subs	x10, x10, #0x80		/* count down nbytes */
	stp	q6, q7, [x2, #-0x20]!	/* store plaintext blocks */
	stp	q4, q5, [x2, #-0x20]!
	stp	q2, q3, [x2, #-0x20]!
	b.ne	1b			/* repeat if there's more */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!	/* store first two plaintext blocks */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec8)
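
/*
 * Unlike encryption, CBC decryption parallelizes, since each plaintext
 * block depends only on two adjacent ciphertext blocks (C-like sketch;
 * xor128 and aes_dec are illustrative names):
 *
 *	out[i] = xor128(aes_dec(key, in[i], nrounds),
 *	    i == 0 ? iv : in[i-1]);
 *
 * Both routines above walk the buffers backwards, so the last
 * ciphertext block can be saved as the next iv up front and, in the
 * eight-block version, eight decryptions can be kept in flight at
 * once.
 */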

/*
 * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q31, [x4]		/* q31 := tweak */
1:	ldr	q0, [x1], #0x10		/* q0 := ptxt */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
	bl	aesarmv8_enc1		/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
	str	q0, [x2], #0x10		/* store ciphertext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	str	q31, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc1)
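
/*
 * For reference (C-like sketch; xor128, aes_enc, and xts_mulx are
 * illustrative names): XTS encryption of block i under the current
 * tweak T is
 *
 *	out[i] = xor128(aes_enc(key, xor128(in[i], T), nrounds), T);
 *	T = xts_mulx(T);
 *
 * where xts_mulx multiplies by x in GF(2^128) (see aesarmv8_xts_mulx
 * below).  The decryption routines are identical with aes_dec in place
 * of aes_enc; the tweak sequence is the same in both directions.
 */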

/*
 * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q31, [x4]		/* q31 := tweak */
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
					/* q31 := tweak[7] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := ptxt[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_enc8		/* encrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	stp	q0, q1, [x2], #0x20	/* store ciphertext blocks */
	stp	q2, q3, [x2], #0x20
	stp	q4, q5, [x2], #0x20
	stp	q6, q7, [x2], #0x20
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80		/* count down nbytes */
	b.ne	1b			/* repeat if more block groups */
	str	q31, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc8)

/*
 * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q31, [x4]		/* q31 := tweak */
1:	ldr	q0, [x1], #0x10		/* q0 := ctxt */
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
	bl	aesarmv8_dec1		/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
	str	q0, [x2], #0x10		/* store plaintext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	str	q31, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec1)

/*
 * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q31, [x4]		/* q31 := tweak */
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
					/* q31 := tweak[7] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := ctxt[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec8		/* decrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	stp	q0, q1, [x2], #0x20	/* store plaintext blocks */
	stp	q2, q3, [x2], #0x20
	stp	q4, q5, [x2], #0x20
	stp	q6, q7, [x2], #0x20
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80		/* count down nbytes */
	b.ne	1b			/* repeat if more block groups */
	str	q31, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec8)

/*
 * aesarmv8_xts_mulx(tweak@q31)
 *
 *	Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses x0 and q0/q1 as temporaries.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_xts_mulx,@function
aesarmv8_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low half must be
	 *     shifted into the low bit of the high half, and
	 * (b) whether the high bit of the high half must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	adrl	x0, xtscarry
	cmlt	v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
	ldr	q0, [x0]		/* q0 := xtscarry */
	ext	v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
	shl	v31.2d, v31.2d, #1	/* shift */
	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
	eor	v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */
	ret
END(aesarmv8_xts_mulx)

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
END(xtscarry)
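
/*
 * For reference (C sketch of the same computation on two 64-bit
 * halves, lo holding bits 0 through 63 of the tweak):
 *
 *	static void
 *	xts_mulx(uint64_t *hi, uint64_t *lo)
 *	{
 *		uint64_t carry_lo = *lo >> 63, carry_hi = *hi >> 63;
 *
 *		*hi = (*hi << 1) | carry_lo;
 *		*lo = (*lo << 1) ^ (carry_hi ? 0x87 : 0);
 *	}
 *
 * The vector code computes both carry masks at once with CMLT #0,
 * swaps them with EXT so each carry lands on the half it feeds, and
 * applies the 0x87/0x01 constants from xtscarry under that mask.
 */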

/*
 * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_update)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q31, [x0]		/* load tweak */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	str	q31, [x1]		/* store tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_update)

/*
 * aesarmv8_enc1(const struct aesenc *enckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Encrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc1,@function
aesarmv8_enc1:
	ldr	q16, [x0], #0x10	/* load round key */
	b	2f
1:	/* q0 := MixColumns(q0) */
	aesmc	v0.16b, v0.16b
2:	subs	x3, x3, #1
	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
	aese	v0.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.ne	1b
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_enc1)
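
/*
 * Note the loop structure: AESE covers AddRoundKey, SubBytes, and
 * ShiftRows in one instruction, so a full round is AESE followed by
 * AESMC, and the final round -- which omits MixColumns -- falls out
 * naturally as the XOR with the last round key after the loop.  In
 * C-like pseudocode:
 *
 *	for (i = 0; i < nrounds - 1; i++)
 *		q0 = MixColumns(ShiftRows(SubBytes(q0 ^ rk[i])));
 *	q0 = ShiftRows(SubBytes(q0 ^ rk[nrounds-1])) ^ rk[nrounds];
 */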

/*
 * aesarmv8_enc8(const struct aesenc *enckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Encrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc8,@function
aesarmv8_enc8:
	ldr	q16, [x0], #0x10	/* load round key */
	b	2f
1:	/* q[i] := MixColumns(q[i]) */
	aesmc	v0.16b, v0.16b
	aesmc	v1.16b, v1.16b
	aesmc	v2.16b, v2.16b
	aesmc	v3.16b, v3.16b
	aesmc	v4.16b, v4.16b
	aesmc	v5.16b, v5.16b
	aesmc	v6.16b, v6.16b
	aesmc	v7.16b, v7.16b
2:	subs	x3, x3, #1
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	aese	v2.16b, v16.16b
	aese	v3.16b, v16.16b
	aese	v4.16b, v16.16b
	aese	v5.16b, v16.16b
	aese	v6.16b, v16.16b
	aese	v7.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.ne	1b
	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_enc8)

/*
 * aesarmv8_dec1(const struct aesdec *deckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Decrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec1,@function
aesarmv8_dec1:
	ldr	q16, [x0], #0x10	/* load round key */
	b	2f
1:	/* q0 := InvMixColumns(q0) */
	aesimc	v0.16b, v0.16b
2:	subs	x3, x3, #1
	/* q0 := InvSubBytes(InvShiftRows(AddRoundKey_q16(q0))) */
	aesd	v0.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.ne	1b
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_dec1)
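
/*
 * The decryption loop mirrors aesarmv8_enc1: AESD covers AddRoundKey,
 * InvShiftRows, and InvSubBytes in one instruction, with AESIMC
 * between rounds.  This is the order of the equivalent inverse cipher,
 * which is why aesarmv8_enctodec applies InvMixColumns to the middle
 * round keys: InvMixColumns is linear, so
 * InvMixColumns(s ^ rk) = InvMixColumns(s) ^ InvMixColumns(rk),
 * and the transformed key can be added before the InvMixColumns step.
 */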

/*
 * aesarmv8_dec8(const struct aesdec *deckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Decrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec8,@function
aesarmv8_dec8:
	ldr	q16, [x0], #0x10	/* load round key */
	b	2f
1:	/* q[i] := InvMixColumns(q[i]) */
	aesimc	v0.16b, v0.16b
	aesimc	v1.16b, v1.16b
	aesimc	v2.16b, v2.16b
	aesimc	v3.16b, v3.16b
	aesimc	v4.16b, v4.16b
	aesimc	v5.16b, v5.16b
	aesimc	v6.16b, v6.16b
	aesimc	v7.16b, v7.16b
2:	subs	x3, x3, #1
	/* q[i] := InvSubBytes(InvShiftRows(AddRoundKey_q16(q[i]))) */
	aesd	v0.16b, v16.16b
	aesd	v1.16b, v16.16b
	aesd	v2.16b, v16.16b
	aesd	v3.16b, v16.16b
	aesd	v4.16b, v16.16b
	aesd	v5.16b, v16.16b
	aesd	v6.16b, v16.16b
	aesd	v7.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.ne	1b
	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_dec8)