/*	$NetBSD: aes_armv8_64.S,v 1.9 2020/07/27 20:53:22 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/endian.h>

#include <aarch64/asm.h>

	.arch_extension	aes

/*
 * uint32_t rcon[10]
 *
 *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) over GF(2).
 *	Such elements of GF(2^8) need only eight bits to be represented,
 *	but we store them in 4-byte units so we can copy one into all
 *	four 4-byte lanes of a vector register with a single LD1R.  The
 *	access pattern is fixed, so indices into this table are never
 *	secret.
 */
	.section .rodata
	.p2align 2
	.type	rcon,@object
rcon:
	.long	0x01
	.long	0x02
	.long	0x04
	.long	0x08
	.long	0x10
	.long	0x20
	.long	0x40
	.long	0x80
	.long	0x1b
	.long	0x36
END(rcon)
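
/*
 * For reference, the rcon table above can be regenerated by repeated
 * doubling in GF(2^8): shift left by one, and if a bit fell off the
 * top, fold in the low byte (0x1b) of the reduction polynomial 0x11b.
 * A minimal C sketch (illustrative only, not part of this file):
 *
 *	uint8_t x = 1;
 *	for (unsigned n = 0; n < 10; n++) {
 *		rcon[n] = x;				// x^n mod 0x11b
 *		x = (x << 1) ^ (x & 0x80 ? 0x1b : 0);
 *	}
 */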

/*
 * uint128_t unshiftrows_rotword_1
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 1, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_1,@object
unshiftrows_rotword_1:
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
END(unshiftrows_rotword_1)

/*
 * uint128_t unshiftrows_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then copy word
 *	3 into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_3,@object
unshiftrows_3:
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
END(unshiftrows_3)

/*
 * uint128_t unshiftrows_rotword_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 3, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_3,@object
unshiftrows_rotword_3:
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
END(unshiftrows_rotword_3)
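
/*
 * How the table entries above are derived: with a zero round key,
 * AESE computes ShiftRows(SubBytes(v)), and ShiftRows sends the byte
 * at index r + 4*c to index r + 4*((c - r) mod 4).  So the bytes of
 * word 3 (indices 12,13,14,15) land at indices 12,9,6,3.  Reading
 * those back in order undoes ShiftRows (unshiftrows_3 = 0c,09,06,03);
 * reading them rotated by one byte additionally performs RotWord
 * (unshiftrows_rotword_3 = 09,06,03,0c).  Likewise word 1's bytes
 * (indices 4,5,6,7) land at 4,1,14,11, giving unshiftrows_rotword_1 =
 * 01,0e,0b,04.  Repeating each 4-byte pattern in all four words makes
 * TBL broadcast the result.
 */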

/*
 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey128)
	ldr	q1, [x1]	/* q1 := master key */

	adrl	x4, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 table */

	str	q1, [x0], #0x10	/* store master key as first round key */
	mov	x2, #10		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x0 = pointer to round key to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #1	/* count down rounds */
	str	q1, [x0], #0x10	/* store round key */
	b.ne	1b

	ret
END(aesarmv8_setenckey128)
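
/*
 * The loop above is the standard AES-128 key schedule; a scalar C
 * sketch of one iteration (illustrative only, with RotWord/SubWord
 * and prk[]/rk[] as 32-bit words, as in FIPS 197):
 *
 *	uint32_t t = RotWord(SubWord(prk[3])) ^ rcon[n];
 *	rk[0] = prk[0] ^ t;
 *	rk[1] = prk[1] ^ rk[0];
 *	rk[2] = prk[2] ^ rk[1];
 *	rk[3] = prk[3] ^ rk[2];
 *
 * The EOR chain against the shifted copies in v5/v6/v7 computes these
 * prefix xors in parallel rather than serially.
 */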

/*
 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey192)
	ldr	q1, [x1], #0x10	/* q1 := master key[0:128) */
	ldr	d2, [x1]	/* d2 := master key[128:192) */

	adrl	x4, unshiftrows_rotword_1
	adrl	x5, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_1 */
	ldr	q17, [x5]	/* q17 := unshiftrows_rotword_3 */

	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
	mov	x2, #12		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
	 * x0 = pointer to three round keys to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *     ^ rklo[1]
	 */

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
	eor	v5.16b, v5.16b, v1.16b
	eor	v5.16b, v5.16b, v3.16b
	eor	v5.16b, v5.16b, v6.16b
	eor	v5.16b, v5.16b, v7.16b

	/*
	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
	 * (rklo[0],rklo[1],...).
	 */

	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	dup	v1.4s, v5.s[3]
	mov	v1.s[0], v5.s[2]

	/*
	 * v6.4s := (0, 0, rklo[0], rklo[1])
	 * v7.4s := (0, 0, 0, rklo[0])
	 */
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v3.16b, v1.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	/*
	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
	 */
	mov	v2.d[1], v5.d[0]

	/* store two round keys */
	stp	q2, q3, [x0], #0x20

	/*
	 * Live vector registers at this point:
	 *
	 *	q0 = zero
	 *	q2 = rk
	 *	q3 = nrk
	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
	 *	q16 = unshiftrows_rotword_1
	 *	q17 = unshiftrows_rotword_3
	 *
	 * We have to compute, in q1:
	 *
	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1]
	 *
	 * And, if there's any more afterward, in q2:
	 *
	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2]
	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2] ^ nrk[3]
	 */

	/* q1 := RotWords(SubBytes(q3)) */
	mov	v1.16b, v3.16b
	aese	v1.16b, v0.16b

	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
	ld1r	{v4.4s}, [x3], #4
	tbl	v1.16b, {v1.16b}, v17.16b
	eor	v1.16b, v1.16b, v4.16b

	/*
	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already]
	 * v4.4s := (0, rk[2], rk[3], nrk[0])
	 * v6.4s := (0, 0, rk[2], rk[3])
	 * v7.4s := (0, 0, 0, rk[2])
	 */
	ext	v4.16b, v0.16b, v5.16b, #12
	ext	v6.16b, v0.16b, v5.16b, #8
	ext	v7.16b, v0.16b, v5.16b, #4

	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #3	/* count down three rounds */
	str	q1, [x0], #0x10	/* store third round key */
	b.eq	2f

	/*
	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
	 * v5.4s := (0, nrk[2], xxx, xxx)
	 */
	ext	v4.16b, v3.16b, v0.16b, #8
	ext	v5.16b, v0.16b, v4.16b, #12

	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
	dup	v2.4s, v1.s[3]

	/*
	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
	 *     xxx, xxx)
	 */
	eor	v2.16b, v2.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	b	1b

2:	ret
END(aesarmv8_setenckey192)

/*
 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey256)
	/* q1 := key[0:128), q2 := key[128:256) */
	ldp	q1, q2, [x1], #0x20

	adrl	x4, unshiftrows_rotword_3
	adrl	x5, unshiftrows_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 */
	ldr	q17, [x5]	/* q17 := unshiftrows_3 */

	/* store master key as first two round keys */
	stp	q1, q2, [x0], #0x20
	mov	x2, #14		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
	 * v6.4s := (0,0,pprk[0],pprk[1])
	 * v7.4s := (0,0,0,pprk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #2		/* count down two rounds */
	b.eq	2f			/* stop if this is the last one */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := SubBytes(rk[3]) */
	tbl	v3.16b, {v3.16b}, v17.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v2.16b, #12
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v2.16b, v2.16b, v3.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v2.16b, v2.16b, v7.16b

	stp	q1, q2, [x0], #0x20	/* store two round keys */
	b	1b

2:	str	q1, [x0]		/* store last round key */
	ret
END(aesarmv8_setenckey256)

/*
 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
 *     uint32_t nrounds@x2)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enctodec)
	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
	b	2f
	_ALIGN_TEXT
1:	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
2:	str	q0, [x1], #0x10	/* store round key */
	subs	x2, x2, #1	/* count down round */
	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
	b.ne	1b		/* repeat if there's more */
	str	q0, [x1]	/* store first round key verbatim */
	ret
END(aesarmv8_enctodec)
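
/*
 * This is the standard `equivalent inverse cipher' conversion: with
 * nr = nrounds, the decryption schedule stored above is
 *
 *	dec[0] = enc[nr]
 *	dec[i] = InvMixColumns(enc[nr - i]),	0 < i < nr
 *	dec[nr] = enc[0]
 *
 * The first and last keys are copied verbatim because the first and
 * last AddRoundKey steps are not surrounded by MixColumns.
 */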

/*
 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enc)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]	/* q0 := ptxt */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	str	q0, [x2]	/* store ctxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_enc)

/*
 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_dec)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x1]	/* q0 := ctxt */
	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
	str	q0, [x2]	/* store ptxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_dec)

/*
 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_enc)
	cbz	x3, 2f			/* stop if nothing to do */
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q0, [x4]		/* q0 := chaining value */
	_ALIGN_TEXT
1:	ldr	q1, [x1], #0x10		/* q1 := plaintext block */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_enc1		/* q0 := ctxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10		/* count down nbytes */
	str	q0, [x2], #0x10		/* store ciphertext block */
	b.ne	1b			/* repeat if x10 is nonzero */
	str	q0, [x4]		/* store chaining value */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
2:	ret
END(aesarmv8_cbc_enc)
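
/*
 * CBC encryption is the recurrence
 *
 *	c[0] = E_k(iv ^ p[0]),	c[i] = E_k(c[i-1] ^ p[i]),
 *
 * in which each block's input depends on the previous block's output,
 * so encryption is inherently serial; that is why, unlike CBC
 * decryption below, there is no 8-block variant here.
 */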

/*
 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q24, [x4]		/* q24 := iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	ldr	q0, [x1, #-0x10]!	/* q0 := last ciphertext block */
	str	q0, [x4]		/* update iv */
	b	2f
	_ALIGN_TEXT
1:	ldr	q31, [x1, #-0x10]!	/* q31 := chaining value */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
	str	q0, [x2, #-0x10]!	/* store plaintext block */
	mov	v0.16b, v31.16b		/* move cv = ciphertext block */
2:	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
	str	q0, [x2, #-0x10]!	/* store first plaintext block */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec1)

/*
 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q24, [x4]		/* q24 := iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	ldp	q6, q7, [x1, #-0x20]!	/* q6, q7 := last ciphertext blocks */
	str	q7, [x4]		/* update iv */
	b	2f
	_ALIGN_TEXT
1:	ldp	q6, q7, [x1, #-0x20]!
	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!
2:	ldp	q4, q5, [x1, #-0x20]!
	ldp	q2, q3, [x1, #-0x20]!
	ldp	q0, q1, [x1, #-0x20]!
	mov	v31.16b, v6.16b		/* q[24+i] := cv[i], 0<i<8 */
	mov	v30.16b, v5.16b
	mov	v29.16b, v4.16b
	mov	v28.16b, v3.16b
	mov	v27.16b, v2.16b
	mov	v26.16b, v1.16b
	mov	v25.16b, v0.16b
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec8		/* q[i] := cv[i] ^ pt[i];
					 * trash x0/x3/q16 */
	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
	eor	v6.16b, v6.16b, v30.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v1.16b, v1.16b, v25.16b
	subs	x10, x10, #0x80		/* count down nbytes */
	stp	q6, q7, [x2, #-0x20]!	/* store plaintext blocks */
	stp	q4, q5, [x2, #-0x20]!
	stp	q2, q3, [x2, #-0x20]!
	b.ne	1b			/* repeat if there's more */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
	stp	q0, q1, [x2, #-0x20]!	/* store first two plaintext blocks */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec8)

/*
 * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q31, [x4]		/* q31 := tweak */
	_ALIGN_TEXT
1:	ldr	q0, [x1], #0x10		/* q0 := ptxt */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
	bl	aesarmv8_enc1		/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
	str	q0, [x2], #0x10		/* store ciphertext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	str	q31, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc1)
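
/*
 * Each XTS block is c[i] = E_k(p[i] ^ t[i]) ^ t[i], where the tweaks
 * follow t[i+1] = t[i] * x in GF(2^128) mod x^128 + x^7 + x^2 + x + 1,
 * as computed by aesarmv8_xts_mulx below.
 */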

/*
 * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q31, [x4]		/* q31 := tweak */
	_ALIGN_TEXT
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
					/* q31 := tweak[7] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := ptxt[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_enc8		/* encrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	stp	q0, q1, [x2], #0x20	/* store ciphertext blocks */
	stp	q2, q3, [x2], #0x20
	stp	q4, q5, [x2], #0x20
	stp	q6, q7, [x2], #0x20
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80		/* count down nbytes */
	b.ne	1b			/* repeat if more block groups */
	str	q31, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc8)

/*
 * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q31, [x4]		/* q31 := tweak */
	_ALIGN_TEXT
1:	ldr	q0, [x1], #0x10		/* q0 := ctxt */
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
	bl	aesarmv8_dec1		/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
	str	q0, [x2], #0x10		/* store plaintext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	str	q31, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec1)

/*
 * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	ldr	q31, [x4]		/* q31 := tweak */
	_ALIGN_TEXT
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
					/* q31 := tweak[7] */
	ldp	q0, q1, [x1], #0x20	/* q[i] := ctxt[i] */
	ldp	q2, q3, [x1], #0x20
	ldp	q4, q5, [x1], #0x20
	ldp	q6, q7, [x1], #0x20
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec8		/* decrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	stp	q0, q1, [x2], #0x20	/* store plaintext blocks */
	stp	q2, q3, [x2], #0x20
	stp	q4, q5, [x2], #0x20
	stp	q6, q7, [x2], #0x20
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80		/* count down nbytes */
	b.ne	1b			/* repeat if more block groups */
	str	q31, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec8)

/*
 * aesarmv8_xts_mulx(tweak@q31)
 *
 *	Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses x0 and q0/q1 as temporaries.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_xts_mulx,@function
aesarmv8_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low half must be
	 *     shifted into the low bit of the high half, and
	 * (b) whether the high bit of the high half must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	adrl	x0, xtscarry
	cmlt	v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
	ldr	q0, [x0]		/* q0 := xtscarry */
	ext	v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
	shl	v31.2d, v31.2d, #1	/* shift */
	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
	eor	v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */
	ret
END(aesarmv8_xts_mulx)

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
END(xtscarry)
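
/*
 * A portable C sketch of the same doubling (illustrative only), with
 * the tweak held as two little-endian 64-bit halves lo and hi:
 *
 *	uint64_t carry = hi >> 63;		// bit shifted out of x^127
 *	hi = (hi << 1) | (lo >> 63);		// (a) low half's carry in
 *	lo = (lo << 1) ^ (carry ? 0x87 : 0);	// (b) fold x^128 back in
 *
 * The vector code evaluates both conditions at once: CMLT turns each
 * lane's sign bit into a mask, EXT swaps the two masks so each lane
 * selects the carry destined for the other half, and the masked
 * xtscarry constant (0x87 in the low lane, 1 in the high lane) is
 * xored in after the SHL.
 */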

/*
 * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_update)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q31, [x0]		/* load tweak */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	str	q31, [x1]		/* store tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_update)

/*
 * aesarmv8_cbcmac_update1(const struct aesenc *enckey@x0,
 *     const uint8_t *in@x1, size_t nbytes@x2, uint8_t auth[16] @x3,
 *     uint32_t nrounds@x4)
 *
 *	Update CBC-MAC.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbcmac_update1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldr	q0, [x3]		/* q0 := initial authenticator */
	mov	x9, x0			/* x9 := enckey */
	mov	x5, x3			/* x5 := &auth (enc1 trashes x3) */
	_ALIGN_TEXT
1:	ldr	q1, [x1], #0x10		/* q1 := plaintext block */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x4			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */
	subs	x2, x2, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if x2 is nonzero */
	str	q0, [x5]		/* store updated authenticator */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbcmac_update1)
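
/*
 * CBC-MAC is the CBC recurrence with the ciphertext discarded,
 * auth' = E_k(auth ^ p[i]); only the final chaining value is written
 * back through x5.
 */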

/*
 * aesarmv8_ccm_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Update CCM encryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_ccm_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldp	q0, q2, [x4]		/* q0 := auth, q2 := ctr (be) */
	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */
#endif
	_ALIGN_TEXT
1:	ldr	q3, [x1], #0x10		/* q3 := plaintext block */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
#else
	mov	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
#endif
	eor	v0.16b, v0.16b, v3.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc2		/* q0 := auth', q1 := pad;
					 * trash x0/x3/q16 */
	eor	v3.16b, v1.16b, v3.16b	/* q3 := ciphertext block */
	subs	x10, x10, #0x10		/* count down bytes */
	str	q3, [x2], #0x10		/* store ciphertext block */
	b.ne	1b			/* repeat if more blocks */
#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v2.16b, v2.16b		/* q2 := ctr (big-endian) */
#endif
	stp	q0, q2, [x4]		/* store updated auth/ctr */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_ccm_enc1)
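
/*
 * Each iteration above combines one CBC-MAC step with one CTR step:
 *
 *	auth' = E_k(auth ^ p[i])		(authentication)
 *	c[i] = p[i] ^ E_k(ctr + 1 + i)		(encryption)
 *
 * The two encryptions are independent, so aesarmv8_enc2 performs them
 * in parallel even though each half on its own is serial.
 */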

/*
 * aesarmv8_ccm_dec1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Update CCM decryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_ccm_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ldp	q1, q2, [x4]		/* q1 := auth, q2 := ctr (be) */
	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */
#endif

	/* Decrypt the first block.  */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x3, x5			/* x3 := nrounds */
#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
#else
	mov	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
#endif
	ldr	q3, [x1], #0x10		/* q3 := ctxt */
	bl	aesarmv8_enc1		/* q0 := pad; trash x0/x3/q16 */
	b	2f

	_ALIGN_TEXT
1:	/*
	 * Authenticate the last block and decrypt the next block
	 * simultaneously.
	 *
	 *	q1 = auth ^ ptxt[-1]
	 *	q2 = ctr[-1] (le)
	 */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
#else
	mov	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
#endif
	ldr	q3, [x1], #0x10		/* q3 := ctxt */
	bl	aesarmv8_enc2		/* q0 := pad, q1 := auth';
					 * trash x0/x3/q16 */
2:	eor	v3.16b, v0.16b, v3.16b	/* q3 := plaintext block */
	subs	x10, x10, #0x10
	str	q3, [x2], #0x10		/* store plaintext */
	eor	v1.16b, v1.16b, v3.16b	/* q1 := auth ^ ptxt */
	b.ne	1b

#if _BYTE_ORDER == _LITTLE_ENDIAN
	rev32	v2.16b, v2.16b		/* q2 := ctr (big-endian) */
#endif

	/* Authenticate the last block.  */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	mov	v0.16b, v1.16b		/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */
	stp	q0, q2, [x4]		/* store updated auth/ctr */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_ccm_dec1)

	.section .rodata
	.p2align 4
	.type	ctr32_inc,@object
ctr32_inc:
	.int	0, 0, 0, 1
END(ctr32_inc)

/*
 * aesarmv8_enc1(const struct aesenc *enckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Encrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc1,@function
aesarmv8_enc1:
	ldr	q16, [x0], #0x10	/* load round key */
	b	2f
	_ALIGN_TEXT
1:	/* q0 := MixColumns(q0) */
	aesmc	v0.16b, v0.16b
2:	subs	x3, x3, #1
	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
	aese	v0.16b, v16.16b
	ldr	q16, [x0], #0x10		/* load next round key */
	b.ne	1b
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_enc1)
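
/*
 * Note how the textbook round structure straddles instructions here:
 * AESE does AddRoundKey/SubBytes/ShiftRows and AESMC does MixColumns,
 * so a round-keyed sketch of the loop above looks like (illustrative
 * only):
 *
 *	q0 = aese(q0, rk[0]);		// AddRoundKey + SubBytes/ShiftRows
 *	for (i = 1; i < nrounds; i++) {
 *		q0 = aesmc(q0);		// MixColumns finishes round i
 *		q0 = aese(q0, rk[i]);	// next round's key/Sub/Shift
 *	}
 *	q0 ^= rk[nrounds];		// final AddRoundKey
 *
 * The last round omits MixColumns, which is why the loop exits
 * between AESE and AESMC.  The same structure is shared by the 2- and
 * 8-block variants below.
 */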

/*
 * aesarmv8_enc2(const struct aesenc *enckey@x0,
 *     uint128_t block0@q0, uint128_t block1@q1, uint32_t nrounds@x3)
 *
 *	Encrypt two AES blocks in q0 and q1.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc2,@function
aesarmv8_enc2:
	ldr	q16, [x0], #0x10	/* load round key */
	b	2f
	_ALIGN_TEXT
1:	/* q[i] := MixColumns(q[i]) */
	aesmc	v0.16b, v0.16b
	aesmc	v1.16b, v1.16b
2:	subs	x3, x3, #1
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	ldr	q16, [x0], #0x10		/* load next round key */
	b.ne	1b
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	ret
END(aesarmv8_enc2)

/*
 * aesarmv8_enc8(const struct aesenc *enckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Encrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc8,@function
aesarmv8_enc8:
	ldr	q16, [x0], #0x10	/* load round key */
	b	2f
	_ALIGN_TEXT
1:	/* q[i] := MixColumns(q[i]) */
	aesmc	v0.16b, v0.16b
	aesmc	v1.16b, v1.16b
	aesmc	v2.16b, v2.16b
	aesmc	v3.16b, v3.16b
	aesmc	v4.16b, v4.16b
	aesmc	v5.16b, v5.16b
	aesmc	v6.16b, v6.16b
	aesmc	v7.16b, v7.16b
2:	subs	x3, x3, #1
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	aese	v2.16b, v16.16b
	aese	v3.16b, v16.16b
	aese	v4.16b, v16.16b
	aese	v5.16b, v16.16b
	aese	v6.16b, v16.16b
	aese	v7.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.ne	1b
	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_enc8)

/*
 * aesarmv8_dec1(const struct aesdec *deckey@x0,
 *     uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Decrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec1,@function
aesarmv8_dec1:
	ldr	q16, [x0], #0x10	/* load round key */
	b	2f
	_ALIGN_TEXT
1:	/* q0 := InMixColumns(q0) */
	aesimc	v0.16b, v0.16b
2:	subs	x3, x3, #1
	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
	aesd	v0.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.ne	1b
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_dec1)

/*
 * aesarmv8_dec8(const struct aesdec *deckey@x0,
 *     uint128_t block0@q0, ..., uint128_t block7@q7,
 *     uint32_t nrounds@x3)
 *
 *	Decrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec8,@function
aesarmv8_dec8:
	ldr	q16, [x0], #0x10	/* load round key */
	b	2f
	_ALIGN_TEXT
1:	/* q[i] := InMixColumns(q[i]) */
	aesimc	v0.16b, v0.16b
	aesimc	v1.16b, v1.16b
	aesimc	v2.16b, v2.16b
	aesimc	v3.16b, v3.16b
	aesimc	v4.16b, v4.16b
	aesimc	v5.16b, v5.16b
	aesimc	v6.16b, v6.16b
	aesimc	v7.16b, v7.16b
2:	subs	x3, x3, #1
	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
	aesd	v0.16b, v16.16b
	aesd	v1.16b, v16.16b
	aesd	v2.16b, v16.16b
	aesd	v3.16b, v16.16b
	aesd	v4.16b, v16.16b
	aesd	v5.16b, v16.16b
	aesd	v6.16b, v16.16b
	aesd	v7.16b, v16.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	b.ne	1b
	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_dec8)
   1206  1.1  riastrad END(aesarmv8_dec8)
   1207