/*	$NetBSD: aes_armv8_64.S,v 1.3 2020/06/30 21:53:39 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

	.arch_extension	aes

/*
 * uint32_t rcon[10]
 *
 *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) over GF(2).
 *	Such elements of GF(2^8) need only eight bits to be represented,
 *	but we store them in 4-byte units so we can copy one into all
 *	four 4-byte lanes of a vector register with a single LD1R.  The
 *	access pattern is fixed, so indices into this table are never
 *	secret.
 */
	.section .rodata
	.p2align 2
	.type	rcon,@object
rcon:
	.long	0x01
	.long	0x02
	.long	0x04
	.long	0x08
	.long	0x10
	.long	0x20
	.long	0x40
	.long	0x80
	.long	0x1b
	.long	0x36
END(rcon)
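
/*
 * Illustrative only (a hedged C sketch, not part of this file's
 * build): the rcon table above can be regenerated by repeated
 * multiplication by x in GF(2^8), reducing by 0x1b whenever the high
 * bit falls off:
 *
 *	uint8_t r = 0x01;
 *	for (int i = 0; i < 10; i++) {
 *		rcon[i] = r;				// x^i
 *		r = (r << 1) ^ ((r & 0x80) ? 0x1b : 0);	// r *= x
 *	}
 */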

/*
 * uint128_t unshiftrows_rotword_1
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 1, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_1,@object
unshiftrows_rotword_1:
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
END(unshiftrows_rotword_1)

/*
 * uint128_t unshiftrows_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then copy word
 *	3 into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_3,@object
unshiftrows_3:
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
END(unshiftrows_3)

/*
 * uint128_t unshiftrows_rotword_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 3, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_3,@object
unshiftrows_rotword_3:
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
END(unshiftrows_rotword_3)
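
/*
 * Worked example of how these indices arise (commentary only): AESE
 * leaves the state as ShiftRows(SubBytes(st)), and ShiftRows moves
 * the byte at index 4*c + r to index 4*((c - r) mod 4) + r.  The four
 * bytes of word 3 (indices 12,13,14,15, i.e. r = 0,1,2,3 with c = 3)
 * therefore land at indices 12, 9, 6, and 3.  RotWord of word 3 is
 * (byte13, byte14, byte15, byte12), which post-ShiftRows sits at
 * indices 9, 6, 3, 12; that is exactly the unshiftrows_rotword_3
 * pattern above.  unshiftrows_3 (12, 9, 6, 3) is the same without the
 * rotation, and unshiftrows_rotword_1 reads word 1's rotated bytes
 * from indices 1, 14, 11, 4.
 */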

/*
 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey128)
	ldr	q1, [x1]	/* q1 := master key */

	adrl	x4, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_3 table */

	str	q1, [x0], #0x10	/* store master key as first round key */
	mov	x2, #10		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x0 = pointer to round key to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWord(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v8.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #1	/* count down rounds */
	str	q1, [x0], #0x10	/* store round key */
	b.ne	1b

	ret
END(aesarmv8_setenckey128)
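
/*
 * For reference, a hedged C sketch of the same AES-128 schedule (the
 * vector code above computes all four words of a round key at once;
 * SubWord/RotWord are the FIPS-197 operations):
 *
 *	for (int i = 0; i < 10; i++) {
 *		uint32_t t = RotWord(SubWord(prk[3])) ^ rcon[i];
 *		rk[0] = prk[0] ^ t;
 *		rk[1] = prk[1] ^ rk[0];
 *		rk[2] = prk[2] ^ rk[1];
 *		rk[3] = prk[3] ^ rk[2];
 *	}
 *
 * Unrolled, rk[j] = t ^ prk[0] ^ ... ^ prk[j]; the three EXT/EOR
 * chains above compute exactly that prefix-XOR in parallel.
 */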

/*
 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey192)
	ldr	q1, [x1], #0x10	/* q1 := master key[0:128) */
	ldr	d2, [x1]	/* d2 := master key[128:192) */

	adrl	x4, unshiftrows_rotword_1
	adrl	x5, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_1 */
	ldr	q9, [x5]	/* q9 := unshiftrows_rotword_3 */

	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
	mov	x2, #12		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
	 * x0 = pointer to three round keys to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWord(SubBytes(rklo[1])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v8.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *     ^ rklo[1]
	 */

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
	eor	v5.16b, v5.16b, v1.16b
	eor	v5.16b, v5.16b, v3.16b
	eor	v5.16b, v5.16b, v6.16b
	eor	v5.16b, v5.16b, v7.16b

	/*
	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
	 * (rklo[0],rklo[1],...).
	 */

	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	dup	v1.4s, v5.4s[3]
	mov	v1.4s[0], v5.4s[2]

	/*
	 * v6.4s := (0, 0, rklo[0], rklo[1])
	 * v7.4s := (0, 0, 0, rklo[0])
	 */
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v3.16b, v1.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	/*
	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
	 */
	mov	v2.2d[1], v5.2d[0]

	/* store two round keys */
	stp	q2, q3, [x0], #0x20

	/*
	 * Live vector registers at this point:
	 *
	 *	q0 = zero
	 *	q2 = rk
	 *	q3 = nrk
	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
	 *	q8 = unshiftrows_rotword_1
	 *	q9 = unshiftrows_rotword_3
	 *
	 * We have to compute, in q1:
	 *
	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1]
	 *
	 * And, if there's any more afterward, in q2:
	 *
	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2]
	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2] ^ nrk[3]
	 */

	/* q1 := ShiftRows(SubBytes(q3)) */
	mov	v1.16b, v3.16b
	aese	v1.16b, v0.16b

	/* v1.4s[i] := RotWord(SubBytes(nrk[3])) ^ RCON' */
	ld1r	{v4.4s}, [x3], #4
	tbl	v1.16b, {v1.16b}, v9.16b
	eor	v1.16b, v1.16b, v4.16b

	/*
	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already]
	 * v4.4s := (0, rk[2], rk[3], nrk[0])
	 * v6.4s := (0, 0, rk[2], rk[3])
	 * v7.4s := (0, 0, 0, rk[2])
	 */
	ext	v4.16b, v0.16b, v5.16b, #12
	ext	v6.16b, v0.16b, v5.16b, #8
	ext	v7.16b, v0.16b, v5.16b, #4

	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #3	/* count down three rounds */
	str	q1, [x0], #0x10	/* store third round key */
	b.eq	2f

	/*
	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
	 * v5.4s := (0, nrk[2], xxx, xxx)
	 */
	ext	v4.16b, v3.16b, v0.16b, #8
	ext	v5.16b, v0.16b, v4.16b, #12

	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
	dup	v2.4s, v1.4s[3]

	/*
	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
	 *     xxx, xxx)
	 */
	eor	v2.16b, v2.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	b	1b

2:	ret
END(aesarmv8_setenckey192)
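
/*
 * Commentary (hedged C sketch): AES-192 generates six schedule words
 * per RCON step, i.e. 1.5 round keys, which is why each iteration
 * above consumes two round constants and emits three round keys.
 * The underlying word recurrence is:
 *
 *	// w[] holds 4*13 = 52 schedule words; the first 6 are the key.
 *	for (int i = 6; i < 52; i++) {
 *		uint32_t t = w[i - 1];
 *		if (i % 6 == 0)
 *			t = RotWord(SubWord(t)) ^ rcon[i/6 - 1];
 *		w[i] = w[i - 6] ^ t;
 *	}
 */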

/*
 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey256)
	/* q1 := key[0:128), q2 := key[128:256) */
	ldp	q1, q2, [x1], #0x20

	adrl	x4, unshiftrows_rotword_3
	adrl	x5, unshiftrows_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_3 */
	ldr	q9, [x5]	/* q9 := unshiftrows_3 */

	/* store master key as first two round keys */
	stp	q1, q2, [x0], #0x20
	mov	x2, #14		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWord(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v8.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
	 * v6.4s := (0,0,pprk[0],pprk[1])
	 * v7.4s := (0,0,0,pprk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #2		/* count down two rounds */
	b.eq	2f			/* stop if this is the last one */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := SubBytes(rk[3]) */
	tbl	v3.16b, {v3.16b}, v9.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v2.16b, #12
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v2.16b, v2.16b, v3.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v2.16b, v2.16b, v7.16b

	stp	q1, q2, [x0], #0x20	/* store two round keys */
	b	1b

2:	str	q1, [x0]		/* store last round key */
	ret
END(aesarmv8_setenckey256)
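
/*
 * Commentary (hedged C sketch): AES-256 applies RotWord ^ RCON only
 * on every other 4-word step; the intermediate step uses SubWord
 * alone, which is why the second half of the loop above uses
 * unshiftrows_3 with no rotation and no round constant:
 *
 *	// w[] holds 4*15 = 60 schedule words; the first 8 are the key.
 *	for (int i = 8; i < 60; i++) {
 *		uint32_t t = w[i - 1];
 *		if (i % 8 == 0)
 *			t = RotWord(SubWord(t)) ^ rcon[i/8 - 1];
 *		else if (i % 8 == 4)
 *			t = SubWord(t);
 *		w[i] = w[i - 8] ^ t;
 *	}
 */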

/*
 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
 *     uint32_t nrounds@x2)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enctodec)
	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
1:	str	q0, [x1], #0x10	/* store round key */
	subs	x2, x2, #1	/* count down round */
	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
	b.eq	2f		/* stop if this is the last one */
	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
	b	1b
2:	str	q0, [x1]	/* store first round key verbatim */
	ret
END(aesarmv8_enctodec)
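
/*
 * Commentary (hedged C sketch): this is the standard "equivalent
 * inverse cipher" conversion: decryption keys are the encryption
 * keys in reverse order, with InvMixColumns applied to all but the
 * first and last:
 *
 *	memcpy(dec[0], enc[nrounds], 16);
 *	for (unsigned i = 1; i < nrounds; i++)
 *		InvMixColumns(dec[i], enc[nrounds - i]);
 *	memcpy(dec[nrounds], enc[0], 16);
 *
 * This lets aesarmv8_dec1/dec8 run AESD/AESIMC in the same order as
 * the forward cipher runs AESE/AESMC.
 */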

/*
 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enc)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]	/* q0 := block */
	bl	aesarmv8_enc1
	str	q0, [x2]	/* store block */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_enc)

/*
 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_dec)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]	/* q0 := block */
	bl	aesarmv8_dec1
	str	q0, [x2]	/* store block */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_dec)

/*
 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_enc)
	cbz	x3, 2f			/* stop if nothing to do */
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q0, [x4]		/* q0 := chaining value */
1:	ldr	q1, [x1], #0x10		/* q1 := plaintext block */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_enc1		/* q0 := ciphertext block */
	subs	x10, x10, #0x10		/* count down nbytes */
	str	q0, [x2], #0x10		/* store ciphertext block */
	b.ne	1b			/* repeat if x10 is nonzero */
	str	q0, [x4]		/* store chaining value */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
2:	ret
END(aesarmv8_cbc_enc)
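
/*
 * Commentary (hedged): CBC encryption is the inherently serial
 * recurrence
 *
 *	c[i] = E_k(p[i] ^ c[i-1]),	c[-1] = IV,
 *
 * so there is no 8-block variant here: each block's input depends on
 * the previous block's output, and the loop above simply keeps the
 * chaining value live in q0 across iterations.
 */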

/*
 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec1)
	stp	fp, lr, [sp, #-32]!	/* push stack frame with uint128 */
	mov	fp, sp
	ldr	q8, [x4]		/* q8 := iv */
	str	q8, [sp, #16]		/* save iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	ldr	q0, [x1, #-0x10]!	/* q0 := last ciphertext block */
	str	q0, [x4]		/* update iv */
1:	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.eq	2f			/* stop if this is the first block */
	ldr	q8, [x1, #-0x10]!	/* q8 := chaining value */
	eor	v0.16b, v0.16b, v8.16b	/* q0 := plaintext block */
	str	q0, [x2, #-0x10]!	/* store plaintext block */
	mov	v0.16b, v8.16b		/* move cv = ciphertext block */
	b	1b
2:	ldr	q8, [sp, #16]		/* q8 := iv */
	eor	v0.16b, v0.16b, v8.16b	/* q0 := first plaintext block */
	str	q0, [x2, #-0x10]!	/* store first plaintext block */
	ldp	fp, lr, [sp], #32	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec1)
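
/*
 * Commentary (hedged): CBC decryption is parallelizable, since
 *
 *	p[i] = D_k(c[i]) ^ c[i-1],	c[-1] = IV,
 *
 * depends only on ciphertext.  These routines walk the buffer from
 * the end toward the beginning; note that the last ciphertext block
 * is saved as the next iv before any output is written, so the iv
 * update stays correct even when out overwrites in.
 */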

/*
 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec8)
	stp	fp, lr, [sp, #-32]!	/* push stack frame with uint128 */
	mov	fp, sp
	ldr	q8, [x4]		/* q8 := iv */
	str	q8, [sp, #16]		/* save iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	ldp	q6, q7, [x1, #-0x20]!	/* q6, q7 := last ciphertext blocks */
	str	q7, [x4]		/* update iv */
1:	ldp	q4, q5, [x1, #-0x20]!
	ldp	q2, q3, [x1, #-0x20]!
	ldp	q0, q1, [x1, #-0x20]!
	mov	v15.16b, v6.16b		/* q[8+i] := cv[i], 0<i<8 */
	mov	v14.16b, v5.16b
	mov	v13.16b, v4.16b
	mov	v12.16b, v3.16b
	mov	v11.16b, v2.16b
	mov	v10.16b, v1.16b
	mov	v9.16b, v0.16b
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec8		/* q[i] := cv[i] ^ pt[i] */
	eor	v7.16b, v7.16b, v15.16b	/* q[i] := pt[i] */
	eor	v6.16b, v6.16b, v14.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v1.16b, v1.16b, v9.16b
	subs	x10, x10, #0x80		/* count down nbytes */
	stp	q6, q7, [x2, #-0x20]!	/* store plaintext blocks */
	stp	q4, q5, [x2, #-0x20]!
	stp	q2, q3, [x2, #-0x20]!
	b.eq	2f			/* stop if this is the first block group */
	ldp	q6, q7, [x1, #-0x20]!
	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!
	b	1b
2:	ldr	q8, [sp, #16]		/* q8 := iv */
	eor	v0.16b, v0.16b, v8.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!	/* store first two plaintext blocks */
	ldp	fp, lr, [sp], #32	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec8)

/*
 * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q9, [x4]		/* q9 := tweak */
1:	ldr	q0, [x1], #0x10		/* q0 := ptxt */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v9.16b	/* q0 := ptxt ^ tweak */
	bl	aesarmv8_enc1		/* q0 := AES(ptxt ^ tweak) */
	eor	v0.16b, v0.16b, v9.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
	str	q0, [x2], #0x10		/* store ciphertext block */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	str	q9, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc1)
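
/*
 * Commentary (hedged): both XTS directions share the same shape,
 *
 *	c[i] = E_k(p[i] ^ t[i]) ^ t[i],	t[i+1] = t[i] * x,
 *
 * with D_k in place of E_k for decryption, where the tweak t lives in
 * GF(2^128) and the multiplication by x is aesarmv8_xts_mulx below.
 * Only the block cipher call differs between the xts_enc* and
 * xts_dec* routines.
 */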

/*
 * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc8)
	stp	fp, lr, [sp, #-48]!	/* push stack frame uint128[2] */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q9, [x4]		/* q9 := tweak */
1:	str	q9, [sp, #16]		/* save tweak[0] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	str	q9, [sp, #32]		/* save tweak[1] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v10.16b, v9.16b		/* q10 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v11.16b, v9.16b		/* q11 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v12.16b, v9.16b		/* q12 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v13.16b, v9.16b		/* q13 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v14.16b, v9.16b		/* q14 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v15.16b, v9.16b		/* q15 := tweak[7] */
	ldp	q8, q9, [sp, #16]	/* q8 := tweak[0], q9 := tweak[1] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := pt[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v8.16b	/* q[i] := pt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_enc8		/* encrypt q0,...,q7; trash x0/x3/q8 */
	ldr	q8, [sp, #16]		/* reload q8 := tweak[0] */
	eor	v1.16b, v1.16b, v9.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v0.16b, v0.16b, v8.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b
	stp	q0, q1, [x2], #0x20	/* store ciphertext blocks */
	stp	q2, q3, [x2], #0x20	/* store ciphertext blocks */
	stp	q4, q5, [x2], #0x20	/* store ciphertext blocks */
	stp	q6, q7, [x2], #0x20	/* store ciphertext blocks */
	mov	v9.16b, v15.16b		/* q9 := q15 = tweak[7] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80		/* count down nbytes */
	b.ne	1b			/* repeat if more block groups */
	str	q9, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #48	/* pop stack frame */
	ret
END(aesarmv8_xts_enc8)

/*
 * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q9, [x4]		/* q9 := tweak */
1:	ldr	q0, [x1], #0x10		/* q0 := ctxt */
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v9.16b	/* q0 := ctxt ^ tweak */
	bl	aesarmv8_dec1		/* q0 := AES^-1(ctxt ^ tweak) */
	eor	v0.16b, v0.16b, v9.16b	/* q0 := AES^-1(ctxt ^ tweak) ^ tweak */
	str	q0, [x2], #0x10		/* store plaintext block */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	str	q9, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec1)

/*
 * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec8)
	stp	fp, lr, [sp, #-48]!	/* push stack frame uint128[2] */
	mov	fp, sp
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q9, [x4]		/* q9 := tweak */
1:	str	q9, [sp, #16]		/* save tweak[0] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	str	q9, [sp, #32]		/* save tweak[1] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v10.16b, v9.16b		/* q10 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v11.16b, v9.16b		/* q11 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v12.16b, v9.16b		/* q12 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v13.16b, v9.16b		/* q13 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v14.16b, v9.16b		/* q14 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	mov	v15.16b, v9.16b		/* q15 := tweak[7] */
	ldp	q8, q9, [sp, #16]	/* q8 := tweak[0], q9 := tweak[1] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := ct[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v8.16b	/* q[i] := ct[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec8		/* decrypt q0,...,q7; trash x0/x3/q8 */
	ldr	q8, [sp, #16]		/* reload q8 := tweak[0] */
	eor	v1.16b, v1.16b, v9.16b	/* q[i] := AES^-1(...) ^ tweak[i] */
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v0.16b, v0.16b, v8.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b
	stp	q0, q1, [x2], #0x20	/* store plaintext blocks */
	stp	q2, q3, [x2], #0x20	/* store plaintext blocks */
	stp	q4, q5, [x2], #0x20	/* store plaintext blocks */
	stp	q6, q7, [x2], #0x20	/* store plaintext blocks */
	mov	v9.16b, v15.16b		/* q9 := q15 = tweak[7] */
	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80		/* count down nbytes */
	b.ne	1b			/* repeat if more block groups */
	str	q9, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #48	/* pop stack frame */
	ret
END(aesarmv8_xts_dec8)

/*
 * aesarmv8_xts_mulx(tweak@q9)
 *
 *	Multiply q9 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses x0 and q0/q1 as temporaries.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_xts_mulx,@function
aesarmv8_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low half must be
	 *     shifted into the low bit of the high half, and
	 * (b) whether the high bit of the high half must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	adrl	x0, xtscarry
	cmlt	v1.2d, v9.2d, #0 /* v1.2d[i] := -1 if v9.2d[i] < 0, else 0 */
	ldr	q0, [x0]		/* q0 := xtscarry */
	ext	v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
	shl	v9.2d, v9.2d, #1	/* shift */
	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
	eor	v9.16b, v9.16b, v0.16b	/* incorporate (a) and (b) */
	ret
END(aesarmv8_xts_mulx)

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
END(xtscarry)
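
/*
 * Commentary (hedged C sketch): in scalar code the same doubling in
 * GF(2^128), with the tweak as two little-endian 64-bit halves, is
 *
 *	uint64_t carry_lo = lo >> 63, carry_hi = hi >> 63;
 *	hi = (hi << 1) | carry_lo;
 *	lo = (lo << 1) ^ (carry_hi ? 0x87 : 0);
 *
 * The vector version does both halves at once: CMLT builds a lane
 * mask from the two sign bits, EXT swaps the mask halves so each lane
 * sees the other half's carry, and xtscarry supplies 0x87 for the
 * reduction into the low half and 1 for the bit shifted into the
 * high half.
 */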

/*
 * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_update)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q9, [x0]		/* load tweak */
	bl	aesarmv8_xts_mulx	/* q9 *= x */
	str	q9, [x1]		/* store tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_update)

/*
 * aesarmv8_enc1(const struct aesenc *enckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Encrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc1,@function
aesarmv8_enc1:
	ldr	q8, [x0], #0x10		/* load round key */
1:	subs	x3, x3, #1
	/* q0 := ShiftRows(SubBytes(AddRoundKey_q8(q0))) */
	aese	v0.16b, v8.16b
	ldr	q8, [x0], #0x10		/* load next round key */
	b.eq	2f
	/* q0 := MixColumns(q0) */
	aesmc	v0.16b, v0.16b
	b	1b
2:	eor	v0.16b, v0.16b, v8.16b
	ret
END(aesarmv8_enc1)
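
/*
 * Commentary (hedged): AESE fuses AddRoundKey with SubBytes and
 * ShiftRows, so the round key is fed into AESE rather than XORed in
 * afterward, and the loop runs AESMC only between rounds because the
 * final AES round omits MixColumns.  Equivalent C-style pseudocode:
 *
 *	for (i = 0; i < nrounds - 1; i++)
 *		q0 = MixColumns(ShiftRows(SubBytes(q0 ^ rk[i])));
 *	q0 = ShiftRows(SubBytes(q0 ^ rk[nrounds - 1])) ^ rk[nrounds];
 */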

/*
 * aesarmv8_enc8(const struct aesenc *enckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Encrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc8,@function
aesarmv8_enc8:
	ldr	q8, [x0], #0x10		/* load round key */
1:	subs	x3, x3, #1
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q8(q[i]))) */
	aese	v0.16b, v8.16b
	aese	v1.16b, v8.16b
	aese	v2.16b, v8.16b
	aese	v3.16b, v8.16b
	aese	v4.16b, v8.16b
	aese	v5.16b, v8.16b
	aese	v6.16b, v8.16b
	aese	v7.16b, v8.16b
	ldr	q8, [x0], #0x10		/* load next round key */
	b.eq	2f
	/* q[i] := MixColumns(q[i]) */
	aesmc	v0.16b, v0.16b
	aesmc	v1.16b, v1.16b
	aesmc	v2.16b, v2.16b
	aesmc	v3.16b, v3.16b
	aesmc	v4.16b, v4.16b
	aesmc	v5.16b, v5.16b
	aesmc	v6.16b, v6.16b
	aesmc	v7.16b, v7.16b
	b	1b
2:	eor	v0.16b, v0.16b, v8.16b	/* AddRoundKey */
	eor	v1.16b, v1.16b, v8.16b
	eor	v2.16b, v2.16b, v8.16b
	eor	v3.16b, v3.16b, v8.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v8.16b
	eor	v6.16b, v6.16b, v8.16b
	eor	v7.16b, v7.16b, v8.16b
	ret
END(aesarmv8_enc8)

/*
 * aesarmv8_dec1(const struct aesdec *deckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Decrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec1,@function
aesarmv8_dec1:
	ldr	q8, [x0], #0x10		/* load round key */
1:	subs	x3, x3, #1
	/* q0 := InvSubBytes(InvShiftRows(AddRoundKey_q8(q0))) */
	aesd	v0.16b, v8.16b
	ldr	q8, [x0], #0x10		/* load next round key */
	b.eq	2f
	/* q0 := InvMixColumns(q0) */
	aesimc	v0.16b, v0.16b
	b	1b
2:	eor	v0.16b, v0.16b, v8.16b
	ret
END(aesarmv8_dec1)
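
/*
 * Commentary (hedged): AESD fuses AddRoundKey with InvShiftRows and
 * InvSubBytes.  Interleaving AESIMC between AESD steps in the same
 * loop shape as encryption works only because aesarmv8_enctodec
 * already applied InvMixColumns to the middle round keys (the
 * "equivalent inverse cipher" arrangement).
 */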

/*
 * aesarmv8_dec8(const struct aesdec *deckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Decrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec8,@function
aesarmv8_dec8:
	ldr	q8, [x0], #0x10		/* load round key */
1:	subs	x3, x3, #1
	/* q[i] := InvSubBytes(InvShiftRows(AddRoundKey_q8(q[i]))) */
	aesd	v0.16b, v8.16b
	aesd	v1.16b, v8.16b
	aesd	v2.16b, v8.16b
	aesd	v3.16b, v8.16b
	aesd	v4.16b, v8.16b
	aesd	v5.16b, v8.16b
	aesd	v6.16b, v8.16b
	aesd	v7.16b, v8.16b
	ldr	q8, [x0], #0x10		/* load next round key */
	b.eq	2f
	/* q[i] := InvMixColumns(q[i]) */
	aesimc	v0.16b, v0.16b
	aesimc	v1.16b, v1.16b
	aesimc	v2.16b, v2.16b
	aesimc	v3.16b, v3.16b
	aesimc	v4.16b, v4.16b
	aesimc	v5.16b, v5.16b
	aesimc	v6.16b, v6.16b
	aesimc	v7.16b, v7.16b
	b	1b
2:	eor	v0.16b, v0.16b, v8.16b	/* AddRoundKey */
	eor	v1.16b, v1.16b, v8.16b
	eor	v2.16b, v2.16b, v8.16b
	eor	v3.16b, v3.16b, v8.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v8.16b
	eor	v6.16b, v6.16b, v8.16b
	eor	v7.16b, v7.16b, v8.16b
	ret
END(aesarmv8_dec8)
   1015