/* Home | History | Annotate | Line # | Download | only in arm */
      1  1.15  riastrad /*	$NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $	*/
      2   1.1  riastrad 
      3   1.1  riastrad /*-
      4   1.1  riastrad  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5   1.1  riastrad  * All rights reserved.
      6   1.1  riastrad  *
      7   1.1  riastrad  * Redistribution and use in source and binary forms, with or without
      8   1.1  riastrad  * modification, are permitted provided that the following conditions
      9   1.1  riastrad  * are met:
     10   1.1  riastrad  * 1. Redistributions of source code must retain the above copyright
     11   1.1  riastrad  *    notice, this list of conditions and the following disclaimer.
     12   1.1  riastrad  * 2. Redistributions in binary form must reproduce the above copyright
     13   1.1  riastrad  *    notice, this list of conditions and the following disclaimer in the
     14   1.1  riastrad  *    documentation and/or other materials provided with the distribution.
     15   1.1  riastrad  *
     16   1.1  riastrad  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17   1.1  riastrad  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18   1.1  riastrad  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19   1.1  riastrad  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20   1.1  riastrad  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21   1.1  riastrad  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22   1.1  riastrad  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23   1.1  riastrad  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24   1.1  riastrad  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25   1.1  riastrad  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26   1.1  riastrad  * POSSIBILITY OF SUCH DAMAGE.
     27   1.1  riastrad  */
     28   1.1  riastrad 
     29   1.1  riastrad #include <aarch64/asm.h>
     30   1.1  riastrad 
     31  1.15  riastrad RCSID("$NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $")
     32  1.11  riastrad 
     33   1.3  riastrad 	.arch_extension	aes
     34   1.1  riastrad 
/*
 * uint32_t rcon[10]
 *
 *	Round-constant table: entry n is x^n mod (x^8 + x^4 + x^3 + x + 1)
 *	in GF(2)[x], i.e. the AES Rcon value used in round n+1 of the
 *	key schedule.  Each constant fits in one byte, but entries are
 *	stored as 4-byte words so a single LD1R can broadcast one into
 *	all four 32-bit lanes of a vector register.  The access pattern
 *	is fixed, so indices into this table are never secret.
 */
	.section .rodata
	.p2align 2
	.type	rcon,@object
rcon:
	.long	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
END(rcon)
     60   1.1  riastrad 
/*
 * uint128_t unshiftrows_rotword_1
 *
 *	Index vector for the TBL instruction: undoes ShiftRows, then
 *	applies RotWord to word 1 of the result, and broadcasts that
 *	word into all four words of the destination.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_1,@object
unshiftrows_rotword_1:
	.rept	4			/* same 4-byte selector in every word */
	.byte	0x01,0x0e,0x0b,0x04
	.endr
END(unshiftrows_rotword_1)
     76   1.1  riastrad 
/*
 * uint128_t unshiftrows_3
 *
 *	Index vector for the TBL instruction: undoes ShiftRows, then
 *	broadcasts word 3 of the result into all four words of the
 *	destination (no RotWord).
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_3,@object
unshiftrows_3:
	.rept	4			/* same 4-byte selector in every word */
	.byte	0x0c,0x09,0x06,0x03
	.endr
END(unshiftrows_3)
     92   1.1  riastrad 
/*
 * uint128_t unshiftrows_rotword_3
 *
 *	Index vector for the TBL instruction: undoes ShiftRows, then
 *	applies RotWord to word 3 of the result, and broadcasts that
 *	word into all four words of the destination.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_3,@object
unshiftrows_rotword_3:
	.rept	4			/* same 4-byte selector in every word */
	.byte	0x09,0x06,0x03,0x0c
	.endr
END(unshiftrows_rotword_3)
    108   1.1  riastrad 
/*
 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	One round key is derived per loop iteration from the previous
 *	round key (prk) using RotWord/SubWord/Rcon on its last word.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey128)
	ld1	{v1.16b}, [x1]	/* q1 := master key */

	adrl	x4, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 table */

	str	q1, [x0], #0x10	/* store master key as first round key */
	mov	x2, #10		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * Loop invariant:
	 *
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x0 = pointer to round key to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/*
	 * q3 := ShiftRows(SubBytes(q1))
	 *
	 * AESE is AddRoundKey, SubBytes, ShiftRows; with the zero key
	 * in q0, AddRoundKey is a no-op, leaving pure SubBytes and
	 * ShiftRows.  The TBL below undoes the unwanted ShiftRows.
	 */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/*
	 * v1.4s := (rk[0], rk[1], rk[2], rk[3]), i.e.
	 * rk[i] = prk[0] ^ ... ^ prk[i] ^ RotWords(SubBytes(prk[3])) ^ RCON
	 */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #1	/* count down rounds */
	str	q1, [x0], #0x10	/* store round key */
	b.ne	1b

	ret
END(aesarmv8_setenckey128)
    165   1.1  riastrad 
/*
 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	The AES-192 schedule produces 6 words (1.5 round keys) per
 *	RotWord/SubWord/Rcon step, so each loop iteration performs two
 *	steps and emits three 128-bit round keys; the round counter is
 *	decremented by 3 per iteration.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey192)
	ld1	{v1.16b}, [x1], #0x10	/* q1 := master key[0:128) */
	ld1	{v2.8b}, [x1]	/* d2 := master key[128:192) */

	adrl	x4, unshiftrows_rotword_1
	adrl	x5, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_1 */
	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_rotword_3 */

	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
	mov	x2, #12		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * Loop invariant:
	 *
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
	 * x0 = pointer to three round keys to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/*
	 * q3 := ShiftRows(SubBytes(q2))
	 *
	 * AESE with the zero key q0 is just SubBytes then ShiftRows;
	 * the TBL below undoes the ShiftRows.
	 */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *     ^ rklo[1]
	 */

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
	eor	v5.16b, v5.16b, v1.16b
	eor	v5.16b, v5.16b, v3.16b
	eor	v5.16b, v5.16b, v6.16b
	eor	v5.16b, v5.16b, v7.16b

	/*
	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
	 * (rklo[0],rklo[1],...).
	 */

	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	dup	v1.4s, v5.s[3]
	mov	v1.s[0], v5.s[2]

	/*
	 * v6.4s := (0, 0, rklo[0], rklo[1])
	 * v7.4s := (0, 0, 0, rklo[0])
	 */
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v3.16b, v1.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	/*
	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
	 */
	mov	v2.d[1], v5.d[0]

	/* store two round keys */
	stp	q2, q3, [x0], #0x20

	/*
	 * Live vector registers at this point:
	 *
	 *	q0 = zero
	 *	q2 = rk
	 *	q3 = nrk
	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
	 *	q16 = unshiftrows_rotword_1
	 *	q17 = unshiftrows_rotword_3
	 *
	 * We have to compute, in q1:
	 *
	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1]
	 *
	 * And, if there's any more afterward, in q2:
	 *
	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2]
	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2] ^ nrk[3]
	 */

	/* q1 := RotWords(SubBytes(q3)) */
	mov	v1.16b, v3.16b
	aese	v1.16b, v0.16b

	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
	ld1r	{v4.4s}, [x3], #4
	tbl	v1.16b, {v1.16b}, v17.16b
	eor	v1.16b, v1.16b, v4.16b

	/*
	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already]
	 * v4.4s := (0, rk[2], rk[3], nrk[0])
	 * v6.4s := (0, 0, rk[2], rk[3])
	 * v7.4s := (0, 0, 0, rk[2])
	 */
	ext	v4.16b, v0.16b, v5.16b, #12
	ext	v6.16b, v0.16b, v5.16b, #8
	ext	v7.16b, v0.16b, v5.16b, #4

	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #3	/* count down three rounds */
	str	q1, [x0], #0x10	/* store third round key */
	b.eq	2f

	/*
	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
	 * v5.4s := (0, nrk[2], xxx, xxx)
	 */
	ext	v4.16b, v3.16b, v0.16b, #8
	ext	v5.16b, v0.16b, v4.16b, #12

	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
	dup	v2.4s, v1.s[3]

	/*
	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
	 *     xxx, xxx)
	 */
	eor	v2.16b, v2.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	b	1b

2:	ret
END(aesarmv8_setenckey192)
    344   1.1  riastrad 
/*
 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Each loop iteration derives two round keys: the first with
 *	RotWord/SubWord/Rcon (via unshiftrows_rotword_3), the second
 *	with SubWord only (via unshiftrows_3), per the AES-256 key
 *	schedule.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey256)
	/* q1 := key[0:128), q2 := key[128:256) */
	ld1	{v1.16b-v2.16b}, [x1], #0x20

	adrl	x4, unshiftrows_rotword_3
	adrl	x5, unshiftrows_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 */
	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_3 */

	/* store master key as first two round keys */
	stp	q1, q2, [x0], #0x20
	mov	x2, #14		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * Loop invariant:
	 *
	 * q0 = 0
	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/*
	 * q3 := ShiftRows(SubBytes(q2))
	 *
	 * AESE with the zero key q0 is just SubBytes then ShiftRows;
	 * the TBL below undoes the ShiftRows.
	 */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
	 * v6.4s := (0,0,pprk[0],pprk[1])
	 * v7.4s := (0,0,0,pprk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #2		/* count down two rounds */
	b.eq	2f			/* stop if this is the last one */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := SubBytes(rk[3]) -- no RotWord or RCON this half */
	tbl	v3.16b, {v3.16b}, v17.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v2.16b, #12
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v2.16b, v2.16b, v3.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v2.16b, v2.16b, v7.16b

	stp	q1, q2, [x0], #0x20	/* store two round keys */
	b	1b

2:	str	q1, [x0]		/* store last round key */
	ret
END(aesarmv8_setenckey256)
    430   1.1  riastrad 
/*
 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
 *     uint32_t nrounds@x2)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`rounds' must be between 10 and 14.
 *
 *	The decryption round keys are the encryption round keys stored
 *	in reverse order, with InvMixColumns (AESIMC) applied to every
 *	key except the first and last, as the equivalent inverse cipher
 *	requires.  The last encryption key is stored first, verbatim;
 *	the first encryption key is stored last, verbatim.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enctodec)
	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
	b	2f
	_ALIGN_TEXT
1:	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
2:	str	q0, [x1], #0x10	/* store round key */
	subs	x2, x2, #1	/* count down round */
	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
	b.ne	1b		/* repeat if there's more */
	str	q0, [x1]	/* store first round key verbatim */
	ret
END(aesarmv8_enctodec)
    452   1.1  riastrad 
/*
 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Encrypt a single block.
 *
 *	Thin wrapper: loads the plaintext, calls the internal
 *	aesarmv8_enc1 routine (which clobbers x0/x3/q16), and stores
 *	the ciphertext.
 *
 *	NOTE(review): ld1 {v.16b}/st1 {v.16b} (rather than ldr/str q)
 *	appear chosen for endian-independent byte order -- confirm.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enc)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x1]	/* q0 := ptxt */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	st1	{v0.16b}, [x2]	/* store ctxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_enc)
    470   1.1  riastrad 
/*
 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Decrypt a single block.
 *
 *	Thin wrapper: loads the ciphertext, calls the internal
 *	aesarmv8_dec1 routine (which clobbers x0/x3/q16), and stores
 *	the plaintext.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_dec)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x1]	/* q0 := ctxt */
	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
	st1	{v0.16b}, [x2]	/* store ptxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_dec)
    488   1.1  riastrad 
/*
 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	CBC encryption is inherently sequential -- each plaintext block
 *	is XORed with the previous ciphertext block (the chaining
 *	value, kept live in q0) before encryption -- so there is no
 *	vectorized multi-block variant of this routine.
 *
 *	The updated chaining value is written back to iv on return.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_enc)
	cbz	x3, 2f			/* stop if nothing to do */
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ld1	{v0.16b}, [x4]		/* q0 := chaining value */
	_ALIGN_TEXT
1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_enc1		/* q0 := ctxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10		/* count down nbytes */
	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
	b.ne	1b			/* repeat if x10 is nonzero */
	st1	{v0.16b}, [x4]		/* store chaining value */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
2:	ret
END(aesarmv8_cbc_enc)
    520   1.1  riastrad 
/*
 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
 *
 *	Blocks are processed backwards, from the end of the buffers
 *	toward the start, and iv is updated to the last ciphertext
 *	block before any output is written (NOTE(review): presumably so
 *	the routine tolerates in == out -- confirm with callers).
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v24.16b}, [x4]		/* q24 := iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	sub	x1, x1, #0x10
	ld1	{v0.16b}, [x1]		/* q0 := last ciphertext block */
	st1	{v0.16b}, [x4]		/* update iv */
	b	2f
	_ALIGN_TEXT
1:	sub	x1, x1, #0x10
	ld1	{v31.16b}, [x1]		/* q31 := chaining value */
	sub	x2, x2, #0x10
	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
	st1	{v0.16b}, [x2]		/* store plaintext block */
	mov	v0.16b, v31.16b		/* move cv = ciphertext block */
2:	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
	sub	x2, x2, #0x10		/* store first plaintext block */
	st1	{v0.16b}, [x2]
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec1)
    563   1.1  riastrad 
/*
 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Like aesarmv8_cbc_dec1, the buffers are walked backwards and
 *	iv is updated up front; unlike CBC encryption, the eight block
 *	decryptions in each unit are independent, so they are handed to
 *	aesarmv8_dec8 in parallel (q0-q7), with the chaining values
 *	parked in q24-q31 across the call.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v24.16b}, [x4]		/* q24 := iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	sub	x1, x1, #0x20
	ld1	{v6.16b, v7.16b}, [x1]	/* q6, q7 := last ciphertext blocks */
	st1	{v7.16b}, [x4]		/* update iv */
	b	2f
	_ALIGN_TEXT
1:	sub	x1, x1, #0x20
	ld1	{v6.16b, v7.16b}, [x1]
	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
	sub	x2, x2, #0x20
	st1	{v0.16b, v1.16b}, [x2]
2:	sub	x1, x1, #0x20
	ld1	{v4.16b-v5.16b}, [x1]
	sub	x1, x1, #0x40
	ld1	{v0.16b-v3.16b}, [x1]

	mov	v31.16b, v6.16b		/* q[24+i] := cv[i], 0<i<8 */
	mov	v30.16b, v5.16b
	mov	v29.16b, v4.16b
	mov	v28.16b, v3.16b
	mov	v27.16b, v2.16b
	mov	v26.16b, v1.16b
	mov	v25.16b, v0.16b
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec8		/* q[i] := cv[i] ^ pt[i];
					 * trash x0/x3/q16 */
	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
	eor	v6.16b, v6.16b, v30.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v1.16b, v1.16b, v25.16b
	subs	x10, x10, #0x80		/* count down nbytes */
	sub	x2, x2, #0x20		/* store plaintext blocks */
	st1	{v6.16b-v7.16b}, [x2]
	sub	x2, x2, #0x40
	st1	{v2.16b-v5.16b}, [x2]
	b.ne	1b			/* repeat if there's more */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
	sub	x2, x2, #0x20
	st1	{v0.16b, v1.16b}, [x2]	/* store first two plaintext blocks */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec8)
    628   1.1  riastrad 
    629   1.1  riastrad /*
    630   1.1  riastrad  * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    631   1.1  riastrad  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    632   1.1  riastrad  *     uint32_t nrounds@x5)
    633   1.1  riastrad  *
    634   1.1  riastrad  *	Encrypt a contiguous sequence of blocks with AES-XTS.
    635   1.1  riastrad  *
    636   1.1  riastrad  *	nbytes must be a positive integral multiple of 16.  This routine
    637   1.1  riastrad  *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
    638   1.1  riastrad  *
    639   1.1  riastrad  *	Standard ABI calling convention.
    640   1.1  riastrad  */
    641   1.1  riastrad ENTRY(aesarmv8_xts_enc1)
    642   1.1  riastrad 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    643   1.1  riastrad 	mov	fp, sp
    644   1.1  riastrad 	mov	x9, x0			/* x9 := enckey */
    645   1.1  riastrad 	mov	x10, x3			/* x10 := nbytes */
    646  1.13  riastrad 	ld1	{v31.16b}, [x4]		/* q31 := tweak */
	/* Loop invariant: q31 = tweak for the next block; x10 = bytes left.
	 * x9/x10 survive the bl's below; enc1/mulx trash x0/x3/q0/q1/q16. */
    647   1.9  riastrad 	_ALIGN_TEXT
    648  1.13  riastrad 1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ptxt */
    649   1.1  riastrad 	mov	x0, x9			/* x0 := enckey */
    650   1.1  riastrad 	mov	x3, x5			/* x3 := nrounds */
    651   1.4  riastrad 	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
    652   1.4  riastrad 	bl	aesarmv8_enc1		/* q0 := AES(...); trash x0/x3/q16 */
    653   1.4  riastrad 	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
    654  1.13  riastrad 	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
    655   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    656   1.1  riastrad 	subs	x10, x10, #0x10		/* count down nbytes */
    657   1.1  riastrad 	b.ne	1b			/* repeat if more blocks */
    658  1.13  riastrad 	st1	{v31.16b}, [x4]		/* update tweak (for next call) */
    659   1.1  riastrad 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    660   1.1  riastrad 	ret
    661   1.1  riastrad END(aesarmv8_xts_enc1)
    662   1.1  riastrad 
    663   1.1  riastrad /*
    664   1.1  riastrad  * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    665   1.1  riastrad  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    666   1.1  riastrad  *     uint32_t nrounds@x5)
    667   1.1  riastrad  *
    668   1.1  riastrad  *	Encrypt a contiguous sequence of blocks with AES-XTS.
    669   1.1  riastrad  *
    670   1.1  riastrad  *	nbytes must be a positive integral multiple of 128.
    671   1.1  riastrad  *
    672   1.1  riastrad  *	Standard ABI calling convention.
    673   1.1  riastrad  */
    674   1.1  riastrad ENTRY(aesarmv8_xts_enc8)
    675   1.4  riastrad 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    676   1.1  riastrad 	mov	fp, sp
    677   1.1  riastrad 	mov	x9, x0			/* x9 := enckey */
    678   1.1  riastrad 	mov	x10, x3			/* x10 := nbytes */
    679  1.13  riastrad 	ld1	{v31.16b}, [x4]		/* q31 := tweak */
	/* Each iteration: derive tweak[0..7] into q24-q31 (q31 doubles as
	 * tweak[7]), then process eight blocks in parallel. */
    680   1.9  riastrad 	_ALIGN_TEXT
    681   1.4  riastrad 1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
    682   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    683   1.4  riastrad 	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
    684   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    685   1.4  riastrad 	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
    686   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    687   1.4  riastrad 	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
    688   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    689   1.4  riastrad 	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
    690   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    691   1.4  riastrad 	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
    692   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    693   1.4  riastrad 	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
    694   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    695   1.4  riastrad 					/* q31 := tweak[7] */
    696  1.14  riastrad 	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ptxt[i] */
    697  1.14  riastrad 	ld1	{v4.16b-v7.16b}, [x1], #0x40
    698   1.4  riastrad 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
    699   1.4  riastrad 	eor	v1.16b, v1.16b, v25.16b
    700   1.4  riastrad 	eor	v2.16b, v2.16b, v26.16b
    701   1.4  riastrad 	eor	v3.16b, v3.16b, v27.16b
    702   1.4  riastrad 	eor	v4.16b, v4.16b, v28.16b
    703   1.4  riastrad 	eor	v5.16b, v5.16b, v29.16b
    704   1.4  riastrad 	eor	v6.16b, v6.16b, v30.16b
    705   1.4  riastrad 	eor	v7.16b, v7.16b, v31.16b
    706   1.1  riastrad 	mov	x0, x9			/* x0 := enckey */
    707   1.1  riastrad 	mov	x3, x5			/* x3 := nrounds */
    708   1.4  riastrad 	bl	aesarmv8_enc8		/* encrypt q0-q7; trash x0/x3/q16 */
    709   1.4  riastrad 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
    710   1.4  riastrad 	eor	v1.16b, v1.16b, v25.16b
    711   1.4  riastrad 	eor	v2.16b, v2.16b, v26.16b
    712   1.4  riastrad 	eor	v3.16b, v3.16b, v27.16b
    713   1.4  riastrad 	eor	v4.16b, v4.16b, v28.16b
    714   1.4  riastrad 	eor	v5.16b, v5.16b, v29.16b
    715   1.4  riastrad 	eor	v6.16b, v6.16b, v30.16b
    716   1.4  riastrad 	eor	v7.16b, v7.16b, v31.16b
    717  1.14  riastrad 	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store ciphertext blocks */
    718  1.14  riastrad 	st1	{v4.16b-v7.16b}, [x2], #0x40
    719   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    720   1.1  riastrad 	subs	x10, x10, #0x80		/* count down nbytes */
    721   1.1  riastrad 	b.ne	1b			/* repeat if more block groups */
    722  1.13  riastrad 	st1	{v31.16b}, [x4]		/* update tweak (for next call) */
    723   1.4  riastrad 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    724   1.1  riastrad 	ret
    725   1.1  riastrad END(aesarmv8_xts_enc8)
    726   1.1  riastrad 
    727   1.1  riastrad /*
    728   1.1  riastrad  * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    729   1.1  riastrad  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    730   1.1  riastrad  *     uint32_t nrounds@x5)
    731   1.1  riastrad  *
    732   1.4  riastrad  *	Decrypt a contiguous sequence of blocks with AES-XTS.
    733   1.1  riastrad  *
    734   1.1  riastrad  *	nbytes must be a positive integral multiple of 16.  This routine
    735   1.1  riastrad  *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
    736   1.1  riastrad  *
    737   1.1  riastrad  *	Standard ABI calling convention.
    738   1.1  riastrad  */
    739   1.1  riastrad ENTRY(aesarmv8_xts_dec1)
    740   1.1  riastrad 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    741   1.1  riastrad 	mov	fp, sp
    742   1.1  riastrad 	mov	x9, x0			/* x9 := deckey */
    743   1.1  riastrad 	mov	x10, x3			/* x10 := nbytes */
    744  1.13  riastrad 	ld1	{v31.16b}, [x4]		/* q31 := tweak */
	/* Loop invariant: q31 = tweak for the next block; x10 = bytes left.
	 * x9/x10 survive the bl's below; dec1/mulx trash x0/x3/q0/q1/q16. */
    745   1.9  riastrad 	_ALIGN_TEXT
    746  1.13  riastrad 1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ctxt */
    747   1.1  riastrad 	mov	x0, x9			/* x0 := deckey */
    748   1.1  riastrad 	mov	x3, x5			/* x3 := nrounds */
    749   1.4  riastrad 	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
    750   1.4  riastrad 	bl	aesarmv8_dec1		/* q0 := AES(...); trash x0/x3/q16 */
    751   1.4  riastrad 	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
    752  1.13  riastrad 	st1	{v0.16b}, [x2], #0x10	/* store plaintext block */
    753   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    754   1.1  riastrad 	subs	x10, x10, #0x10		/* count down nbytes */
    755   1.1  riastrad 	b.ne	1b			/* repeat if more blocks */
    756  1.13  riastrad 	st1	{v31.16b}, [x4]		/* update tweak (for next call) */
    757   1.1  riastrad 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    758   1.1  riastrad 	ret
    759   1.1  riastrad END(aesarmv8_xts_dec1)
    760   1.1  riastrad 
    761   1.1  riastrad /*
    762   1.1  riastrad  * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
    763   1.1  riastrad  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
    764   1.1  riastrad  *     uint32_t nrounds@x5)
    765   1.1  riastrad  *
    766   1.4  riastrad  *	Decrypt a contiguous sequence of blocks with AES-XTS.
    767   1.1  riastrad  *
    768   1.1  riastrad  *	nbytes must be a positive integral multiple of 128.
    769   1.1  riastrad  *
    770   1.1  riastrad  *	Standard ABI calling convention.
    771   1.1  riastrad  */
    772   1.1  riastrad ENTRY(aesarmv8_xts_dec8)
    773   1.4  riastrad 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    774   1.1  riastrad 	mov	fp, sp
    775   1.1  riastrad 	mov	x9, x0			/* x9 := deckey */
    776   1.1  riastrad 	mov	x10, x3			/* x10 := nbytes */
    777  1.13  riastrad 	ld1	{v31.16b}, [x4]		/* q31 := tweak */
	/* Each iteration: derive tweak[0..7] into q24-q31 (q31 doubles as
	 * tweak[7]), then process eight blocks in parallel. */
    778   1.9  riastrad 	_ALIGN_TEXT
    779   1.4  riastrad 1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
    780   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    781   1.4  riastrad 	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
    782   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    783   1.4  riastrad 	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
    784   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    785   1.4  riastrad 	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
    786   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    787   1.4  riastrad 	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
    788   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    789   1.4  riastrad 	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
    790   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    791   1.4  riastrad 	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
    792   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    793   1.4  riastrad 					/* q31 := tweak[7] */
    794  1.13  riastrad 	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ctxt[i] */
    795  1.13  riastrad 	ld1	{v4.16b-v7.16b}, [x1], #0x40
    796   1.4  riastrad 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
    797   1.4  riastrad 	eor	v1.16b, v1.16b, v25.16b
    798   1.4  riastrad 	eor	v2.16b, v2.16b, v26.16b
    799   1.4  riastrad 	eor	v3.16b, v3.16b, v27.16b
    800   1.4  riastrad 	eor	v4.16b, v4.16b, v28.16b
    801   1.4  riastrad 	eor	v5.16b, v5.16b, v29.16b
    802   1.4  riastrad 	eor	v6.16b, v6.16b, v30.16b
    803   1.4  riastrad 	eor	v7.16b, v7.16b, v31.16b
    804   1.1  riastrad 	mov	x0, x9			/* x0 := deckey */
    805   1.1  riastrad 	mov	x3, x5			/* x3 := nrounds */
    806   1.4  riastrad 	bl	aesarmv8_dec8		/* decrypt q0-q7; trash x0/x3/q16 */
    807   1.4  riastrad 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
    808   1.4  riastrad 	eor	v1.16b, v1.16b, v25.16b
    809   1.4  riastrad 	eor	v2.16b, v2.16b, v26.16b
    810   1.4  riastrad 	eor	v3.16b, v3.16b, v27.16b
    811   1.4  riastrad 	eor	v4.16b, v4.16b, v28.16b
    812   1.4  riastrad 	eor	v5.16b, v5.16b, v29.16b
    813   1.4  riastrad 	eor	v6.16b, v6.16b, v30.16b
    814   1.4  riastrad 	eor	v7.16b, v7.16b, v31.16b
    815  1.13  riastrad 	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store plaintext blocks */
    816  1.13  riastrad 	st1	{v4.16b-v7.16b}, [x2], #0x40
    817   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    818   1.1  riastrad 	subs	x10, x10, #0x80		/* count down nbytes */
    819   1.1  riastrad 	b.ne	1b			/* repeat if more block groups */
    820  1.13  riastrad 	st1	{v31.16b}, [x4]		/* update tweak (for next call) */
    821   1.4  riastrad 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    822   1.1  riastrad 	ret
    823   1.1  riastrad END(aesarmv8_xts_dec8)
    824   1.1  riastrad 
    825   1.1  riastrad /*
    826   1.4  riastrad  * aesarmv8_xts_mulx(tweak@q31)
    827   1.1  riastrad  *
    828   1.4  riastrad  *	Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
    829   1.1  riastrad  *	Uses x0 and q0/q1 as temporaries.
    830   1.1  riastrad  */
    831   1.1  riastrad 	.text
    832   1.1  riastrad 	_ALIGN_TEXT
    833   1.1  riastrad 	.type	aesarmv8_xts_mulx,@function
    834   1.1  riastrad aesarmv8_xts_mulx:
    835   1.1  riastrad 	/*
    836   1.1  riastrad 	 * Simultaneously determine
    837   1.1  riastrad 	 * (a) whether the high bit of the low half must be
    838   1.1  riastrad 	 *     shifted into the low bit of the high half, and
    839   1.1  riastrad 	 * (b) whether the high bit of the high half must be
    840   1.1  riastrad 	 *     carried into x^128 = x^7 + x^2 + x + 1.
    841   1.1  riastrad 	 */
    842   1.1  riastrad 	adrl	x0, xtscarry		/* x0 := &xtscarry */
    843   1.6  riastrad 	cmlt	v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
    844  1.13  riastrad 	ld1	{v0.16b}, [x0]		/* q0 := xtscarry */
    845   1.1  riastrad 	ext	v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
    846   1.4  riastrad 	shl	v31.2d, v31.2d, #1	/* shift */
    847   1.1  riastrad 	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
    848   1.4  riastrad 	eor	v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */
    849   1.1  riastrad 	ret
    850   1.1  riastrad END(aesarmv8_xts_mulx)
    851   1.1  riastrad 
    852   1.1  riastrad 	.section .rodata
    853   1.2  riastrad 	.p2align 4
    854   1.1  riastrad 	.type	xtscarry,@object
    855   1.1  riastrad xtscarry:
	/* Low 64 bits: 0x87 = x^7 + x^2 + x + 1, folded in when the high
	 * half's top bit carries out past x^128 (case (b) in mulx).
	 * High 64 bits: 1, the low half's top bit shifted into the low
	 * bit of the high half (case (a)). */
    856   1.1  riastrad 	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
    857   1.1  riastrad END(xtscarry)
    858   1.1  riastrad 
    859   1.1  riastrad /*
    860   1.1  riastrad  * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
    861   1.1  riastrad  *
    862   1.1  riastrad  *	Update an AES-XTS tweak.
    863   1.1  riastrad  *
    864   1.1  riastrad  *	Standard ABI calling convention.
    865   1.1  riastrad  */
    866   1.1  riastrad ENTRY(aesarmv8_xts_update)
    867   1.1  riastrad 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    868   1.1  riastrad 	mov	fp, sp
	/* Load before the call: mulx trashes x0 (= in pointer). */
    869  1.13  riastrad 	ld1	{v31.16b}, [x0]		/* load tweak */
    870   1.4  riastrad 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
    871  1.13  riastrad 	st1	{v31.16b}, [x1]		/* store tweak */
    872   1.1  riastrad 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    873   1.1  riastrad 	ret
    874   1.1  riastrad END(aesarmv8_xts_update)
    875   1.1  riastrad 
    876   1.1  riastrad /*
    877   1.8  riastrad  * aesarmv8_cbcmac_update1(const struct aesenc *enckey@x0,
    878   1.8  riastrad  *     const uint8_t *in@x1, size_t nbytes@x2, uint8_t auth[16] @x3,
    879   1.8  riastrad  *     uint32_t nrounds@x4)
    880   1.8  riastrad  *
    881   1.8  riastrad  *	Update CBC-MAC.
    882   1.8  riastrad  *
    883   1.8  riastrad  *	nbytes must be a positive integral multiple of 16.
    884   1.8  riastrad  *
    885   1.8  riastrad  *	Standard ABI calling convention.
    886   1.8  riastrad  */
    887   1.8  riastrad ENTRY(aesarmv8_cbcmac_update1)
    888   1.8  riastrad 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    889   1.8  riastrad 	mov	fp, sp
    890  1.13  riastrad 	ld1	{v0.16b}, [x3]		/* q0 := initial authenticator */
    891   1.8  riastrad 	mov	x9, x0			/* x9 := enckey */
    892   1.8  riastrad 	mov	x5, x3			/* x5 := &auth (enc1 trashes x3) */
    893   1.9  riastrad 	_ALIGN_TEXT
    894  1.13  riastrad 1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
    895   1.8  riastrad 	mov	x0, x9			/* x0 := enckey */
    896   1.8  riastrad 	mov	x3, x4			/* x3 := nrounds */
    897   1.8  riastrad 	eor	v0.16b, v0.16b, v1.16b	/* q0 := auth ^ ptxt */
    898   1.8  riastrad 	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */
    899   1.8  riastrad 	subs	x2, x2, #0x10		/* count down nbytes */
    900   1.8  riastrad 	b.ne	1b			/* repeat if x2 is nonzero */
    901  1.13  riastrad 	st1	{v0.16b}, [x5]		/* store updated authenticator */
    902   1.8  riastrad 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    903   1.8  riastrad 	ret
    904   1.8  riastrad END(aesarmv8_cbcmac_update1)
    905   1.8  riastrad 
    906   1.8  riastrad /*
    907   1.8  riastrad  * aesarmv8_ccm_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    908   1.8  riastrad  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
    909   1.8  riastrad  *     uint32_t nrounds@x5)
    910   1.8  riastrad  *
    911   1.8  riastrad  *	Update CCM encryption.
    912   1.8  riastrad  *
    913   1.8  riastrad  *	nbytes must be a positive integral multiple of 16.
    914   1.8  riastrad  *
    915   1.8  riastrad  *	Standard ABI calling convention.
    916   1.8  riastrad  */
    917   1.8  riastrad ENTRY(aesarmv8_ccm_enc1)
    918   1.8  riastrad 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    919   1.8  riastrad 	mov	fp, sp
	/* authctr layout: [0..15] = authenticator, [16..31] = big-endian ctr */
    920  1.15  riastrad 	ld1	{v0.16b-v1.16b}, [x4]	/* q0 := auth, q1 := ctr (be) */
    921   1.8  riastrad 	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
    922   1.8  riastrad 	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
    923   1.8  riastrad 	mov	x9, x0			/* x9 := enckey */
    924   1.8  riastrad 	mov	x10, x3			/* x10 := nbytes */
    925  1.15  riastrad 	rev32	v2.16b, v1.16b		/* q2 := ctr (host-endian) */
    926   1.9  riastrad 	_ALIGN_TEXT
    927  1.13  riastrad 1:	ld1	{v3.16b}, [x1], #0x10	/* q3 := plaintext block */
    928   1.8  riastrad 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
    929   1.8  riastrad 	mov	x0, x9			/* x0 := enckey */
    930   1.8  riastrad 	mov	x3, x5			/* x3 := nrounds */
    931   1.8  riastrad 	rev32	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
    932   1.8  riastrad 	eor	v0.16b, v0.16b, v3.16b	/* q0 := auth ^ ptxt */
    933   1.8  riastrad 	bl	aesarmv8_enc2		/* q0 := auth', q1 := pad;
    934   1.8  riastrad 					 * trash x0/x3/q16 */
    935   1.8  riastrad 	eor	v3.16b, v1.16b, v3.16b	/* q3 := ciphertext block */
    936   1.8  riastrad 	subs	x10, x10, #0x10		/* count down bytes */
    937  1.13  riastrad 	st1	{v3.16b}, [x2], #0x10	/* store ciphertext block */
    938   1.8  riastrad 	b.ne	1b			/* repeat if more blocks */
    939  1.15  riastrad 	rev32	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
    940  1.15  riastrad 	st1	{v0.16b-v1.16b}, [x4]	/* store updated auth/ctr */
    941   1.8  riastrad 	ldp	fp, lr, [sp], #16	/* pop stack frame */
    942   1.8  riastrad 	ret
    943   1.8  riastrad END(aesarmv8_ccm_enc1)
    944   1.8  riastrad 
    945   1.8  riastrad /*
    946   1.8  riastrad  * aesarmv8_ccm_dec1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
    947   1.8  riastrad  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
    948   1.8  riastrad  *     uint32_t nrounds@x5)
    949   1.8  riastrad  *
    950   1.8  riastrad  *	Update CCM decryption.
    951   1.8  riastrad  *
    952   1.8  riastrad  *	nbytes must be a positive integral multiple of 16.
    953   1.8  riastrad  *
    954   1.8  riastrad  *	Standard ABI calling convention.
    955   1.8  riastrad  */
    956   1.8  riastrad ENTRY(aesarmv8_ccm_dec1)
    957   1.8  riastrad 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
    958   1.8  riastrad 	mov	fp, sp
	/* authctr layout: [0..15] = authenticator, [16..31] = big-endian ctr */
    959  1.13  riastrad 	ld1	{v1.16b, v2.16b}, [x4]	/* q1 := auth, q2 := ctr (be) */
    960   1.8  riastrad 	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
    961   1.8  riastrad 	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
    962   1.8  riastrad 	mov	x9, x0			/* x9 := enckey */
    963   1.8  riastrad 	mov	x10, x3			/* x10 := nbytes */
    964   1.8  riastrad 	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */
    965   1.8  riastrad 
    966   1.8  riastrad 	/* Decrypt the first block.  (x0 = enckey, untouched so far.) */
    967   1.8  riastrad 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
    968   1.8  riastrad 	mov	x3, x5			/* x3 := nrounds */
    969   1.8  riastrad 	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
    970  1.13  riastrad 	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
    971   1.8  riastrad 	bl	aesarmv8_enc1		/* q0 := pad; trash x0/x3/q16 */
    972   1.8  riastrad 	b	2f
    973   1.8  riastrad 
    974   1.9  riastrad 	_ALIGN_TEXT
    975   1.8  riastrad 1:	/*
    976   1.8  riastrad 	 * Authenticate the last block and decrypt the next block
    977   1.8  riastrad 	 * simultaneously.
    978   1.8  riastrad 	 *
    979   1.8  riastrad 	 *	q1 = auth ^ ptxt[-1]
    980   1.8  riastrad 	 *	q2 = ctr[-1] (le)
    981   1.8  riastrad 	 */
    982   1.8  riastrad 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
    983   1.8  riastrad 	mov	x0, x9			/* x0 := enckey */
    984   1.8  riastrad 	mov	x3, x5			/* x3 := nrounds */
    985   1.8  riastrad 	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
    986  1.13  riastrad 	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
    987   1.8  riastrad 	bl	aesarmv8_enc2		/* q0 := pad, q1 := auth';
    988   1.8  riastrad 					 * trash x0/x3/q16 */
    989   1.8  riastrad 2:	eor	v3.16b, v0.16b, v3.16b	/* q3 := plaintext block */
    990   1.8  riastrad 	subs	x10, x10, #0x10		/* count down nbytes */
    991  1.13  riastrad 	st1	{v3.16b}, [x2], #0x10		/* store plaintext */
    992   1.8  riastrad 	eor	v1.16b, v1.16b, v3.16b	/* q1 := auth ^ ptxt */
    993   1.8  riastrad 	b.ne	1b			/* repeat if more blocks */
    994   1.8  riastrad 
    995   1.8  riastrad 	rev32	v2.16b, v2.16b		/* q2 := ctr (big-endian) */
    996   1.8  riastrad 
    997   1.8  riastrad 	/* Authenticate the last block.  */
    998   1.8  riastrad 	mov	x0, x9			/* x0 := enckey */
    999   1.8  riastrad 	mov	x3, x5			/* x3 := nrounds */
   1000   1.8  riastrad 	mov	v0.16b, v1.16b		/* q0 := auth ^ ptxt */
   1001   1.8  riastrad 	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */
   1002  1.12  riastrad 
   1003  1.13  riastrad 	mov	v1.16b, v2.16b		/* store updated auth/ctr */
   1004  1.13  riastrad 	st1	{v0.16b-v1.16b}, [x4]
   1005   1.8  riastrad 	ldp	fp, lr, [sp], #16	/* pop stack frame */
   1006   1.8  riastrad 	ret
   1007   1.8  riastrad END(aesarmv8_ccm_dec1)
   1008   1.8  riastrad 
   1009   1.8  riastrad 	.section .rodata
   1010   1.8  riastrad 	.p2align 4
   1011   1.8  riastrad 	.type	ctr32_inc,@object
   1012   1.8  riastrad ctr32_inc:
	/* (0,0,0,1): adds 1 to the last 32-bit lane of a host-endian
	 * counter via `add vN.4s, vN.4s, v5.4s' in the CCM routines. */
   1013   1.8  riastrad 	.int	0, 0, 0, 1
   1014   1.8  riastrad END(ctr32_inc)
   1015   1.8  riastrad 
   1016   1.8  riastrad /*
   1017   1.1  riastrad  * aesarmv8_enc1(const struct aesenc *enckey@x0,
   1018   1.1  riastrad  *     uint128_t block@q0, uint32_t nrounds@x3)
   1019   1.1  riastrad  *
   1020   1.1  riastrad  *	Encrypt a single AES block in q0.
   1021   1.1  riastrad  *
   1022   1.4  riastrad  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1023   1.1  riastrad  */
   1024   1.1  riastrad 	.text
   1025   1.1  riastrad 	_ALIGN_TEXT
   1026   1.1  riastrad 	.type	aesarmv8_enc1,@function
   1027   1.1  riastrad aesarmv8_enc1:
   1028   1.4  riastrad 	ldr	q16, [x0], #0x10	/* load round key */
   1029  1.10  riastrad 	sub	x3, x3, #1	/* x3 := nrounds - 1 (last round has no aesmc) */
   1030   1.9  riastrad 	_ALIGN_TEXT
   1031  1.10  riastrad 1:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0)))) */
   1032  1.10  riastrad 	aese	v0.16b, v16.16b
   1033   1.7  riastrad 	aesmc	v0.16b, v0.16b
   1034  1.10  riastrad 	ldr	q16, [x0], #0x10	/* load next round key */
   1035  1.10  riastrad 	subs	x3, x3, #1	/* count down rounds remaining */
   1036  1.10  riastrad 	b.ne	1b
   1037   1.4  riastrad 	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
   1038   1.4  riastrad 	aese	v0.16b, v16.16b
   1039  1.10  riastrad 	ldr	q16, [x0]		/* load last round key */
   1040  1.10  riastrad 	/* q0 := AddRoundKey_q16(q0) */
   1041   1.7  riastrad 	eor	v0.16b, v0.16b, v16.16b
   1042   1.1  riastrad 	ret
   1043   1.1  riastrad END(aesarmv8_enc1)
   1044   1.1  riastrad 
   1045   1.1  riastrad /*
   1046   1.8  riastrad  * aesarmv8_enc2(const struct aesenc *enckey@x0,
   1047   1.8  riastrad  *     uint128_t block@q0, uint128_t block@q1, uint32_t nrounds@x3)
   1048   1.8  riastrad  *
   1049   1.8  riastrad  *	Encrypt two AES blocks in q0 and q1.
   1050   1.8  riastrad  *
   1051   1.8  riastrad  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1052   1.8  riastrad  */
   1053   1.8  riastrad 	.text
   1054   1.8  riastrad 	_ALIGN_TEXT
   1055   1.8  riastrad 	.type	aesarmv8_enc2,@function
   1056   1.8  riastrad aesarmv8_enc2:
   1057   1.8  riastrad 	ldr	q16, [x0], #0x10	/* load round key */
   1058  1.10  riastrad 	sub	x3, x3, #1	/* x3 := nrounds - 1 (last round has no aesmc) */
   1059   1.9  riastrad 	_ALIGN_TEXT
   1060  1.10  riastrad 1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
   1061  1.10  riastrad 	aese	v0.16b, v16.16b
   1062   1.8  riastrad 	aesmc	v0.16b, v0.16b
   1063  1.10  riastrad 	aese	v1.16b, v16.16b
   1064   1.8  riastrad 	aesmc	v1.16b, v1.16b
   1065  1.10  riastrad 	ldr	q16, [x0], #0x10	/* load next round key */
   1066  1.10  riastrad 	subs	x3, x3, #1	/* count down rounds remaining */
   1067  1.10  riastrad 	b.ne	1b
   1068   1.8  riastrad 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
   1069   1.8  riastrad 	aese	v0.16b, v16.16b
   1070   1.8  riastrad 	aese	v1.16b, v16.16b
   1071  1.10  riastrad 	ldr	q16, [x0]		/* load last round key */
   1072  1.10  riastrad 	/* q[i] := AddRoundKey_q16(q[i]) */
   1073   1.8  riastrad 	eor	v0.16b, v0.16b, v16.16b
   1074   1.8  riastrad 	eor	v1.16b, v1.16b, v16.16b
   1075   1.8  riastrad 	ret
   1076   1.8  riastrad END(aesarmv8_enc2)
   1077   1.8  riastrad 
   1078   1.8  riastrad /*
   1079   1.1  riastrad  * aesarmv8_enc8(const struct aesenc *enckey@x0,
   1080   1.1  riastrad  *     uint128_t block0@q0, ..., uint128_t block7@q7,
   1081   1.1  riastrad  *     uint32_t nrounds@x3)
   1082   1.1  riastrad  *
   1083   1.1  riastrad  *	Encrypt eight AES blocks in q0 through q7 in parallel.
   1084   1.1  riastrad  *
   1085   1.4  riastrad  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1086   1.1  riastrad  */
   1087   1.1  riastrad 	.text
   1088   1.1  riastrad 	_ALIGN_TEXT
   1089   1.1  riastrad 	.type	aesarmv8_enc8,@function
   1090   1.1  riastrad aesarmv8_enc8:
   1091   1.4  riastrad 	ldr	q16, [x0], #0x10	/* load round key */
   1092  1.10  riastrad 	sub	x3, x3, #1	/* x3 := nrounds - 1 (last round has no aesmc) */
   1093   1.9  riastrad 	_ALIGN_TEXT
   1094  1.10  riastrad 1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
   1095  1.10  riastrad 	aese	v0.16b, v16.16b
   1096   1.7  riastrad 	aesmc	v0.16b, v0.16b
   1097  1.10  riastrad 	aese	v1.16b, v16.16b
   1098   1.7  riastrad 	aesmc	v1.16b, v1.16b
   1099  1.10  riastrad 	aese	v2.16b, v16.16b
   1100   1.7  riastrad 	aesmc	v2.16b, v2.16b
   1101  1.10  riastrad 	aese	v3.16b, v16.16b
   1102   1.7  riastrad 	aesmc	v3.16b, v3.16b
   1103  1.10  riastrad 	aese	v4.16b, v16.16b
   1104   1.7  riastrad 	aesmc	v4.16b, v4.16b
   1105  1.10  riastrad 	aese	v5.16b, v16.16b
   1106   1.7  riastrad 	aesmc	v5.16b, v5.16b
   1107  1.10  riastrad 	aese	v6.16b, v16.16b
   1108   1.7  riastrad 	aesmc	v6.16b, v6.16b
   1109  1.10  riastrad 	aese	v7.16b, v16.16b
   1110   1.7  riastrad 	aesmc	v7.16b, v7.16b
   1111  1.10  riastrad 	ldr	q16, [x0], #0x10	/* load next round key */
   1112  1.10  riastrad 	subs	x3, x3, #1	/* count down rounds remaining */
   1113  1.10  riastrad 	b.ne	1b
   1114   1.4  riastrad 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
   1115   1.4  riastrad 	aese	v0.16b, v16.16b
   1116   1.4  riastrad 	aese	v1.16b, v16.16b
   1117   1.4  riastrad 	aese	v2.16b, v16.16b
   1118   1.4  riastrad 	aese	v3.16b, v16.16b
   1119   1.4  riastrad 	aese	v4.16b, v16.16b
   1120   1.4  riastrad 	aese	v5.16b, v16.16b
   1121   1.4  riastrad 	aese	v6.16b, v16.16b
   1122   1.4  riastrad 	aese	v7.16b, v16.16b
   1123  1.10  riastrad 	ldr	q16, [x0]		/* load last round key */
   1124  1.10  riastrad 	/* q[i] := AddRoundKey_q16(q[i]) */
   1125  1.10  riastrad 	eor	v0.16b, v0.16b, v16.16b
   1126   1.4  riastrad 	eor	v1.16b, v1.16b, v16.16b
   1127   1.4  riastrad 	eor	v2.16b, v2.16b, v16.16b
   1128   1.4  riastrad 	eor	v3.16b, v3.16b, v16.16b
   1129   1.4  riastrad 	eor	v4.16b, v4.16b, v16.16b
   1130   1.4  riastrad 	eor	v5.16b, v5.16b, v16.16b
   1131   1.4  riastrad 	eor	v6.16b, v6.16b, v16.16b
   1132   1.4  riastrad 	eor	v7.16b, v7.16b, v16.16b
   1133   1.1  riastrad 	ret
   1134   1.1  riastrad END(aesarmv8_enc8)
   1135   1.1  riastrad 
   1136   1.1  riastrad /*
   1137   1.1  riastrad  * aesarmv8_dec1(const struct aesdec *deckey@x0,
   1138   1.1  riastrad  *     uint128_t block@q0, uint32_t nrounds@x3)
   1139   1.1  riastrad  *
   1140   1.1  riastrad  *	Decrypt a single AES block in q0.
   1141   1.1  riastrad  *
   1142   1.4  riastrad  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1143   1.1  riastrad  */
   1144   1.1  riastrad 	.text
   1145   1.1  riastrad 	_ALIGN_TEXT
   1146   1.1  riastrad 	.type	aesarmv8_dec1,@function
   1147   1.1  riastrad aesarmv8_dec1:
   1148   1.4  riastrad 	ldr	q16, [x0], #0x10	/* load round key */
   1149  1.10  riastrad 	sub	x3, x3, #1	/* x3 := nrounds - 1 (last round has no aesimc) */
   1150   1.9  riastrad 	_ALIGN_TEXT
   1151  1.10  riastrad 1:	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
   1152  1.10  riastrad 	aesd	v0.16b, v16.16b
   1153  1.10  riastrad 	/* q0 := InMixColumns(q0) */
   1154   1.7  riastrad 	aesimc	v0.16b, v0.16b
   1155  1.10  riastrad 	ldr	q16, [x0], #0x10	/* load next round key */
   1156  1.10  riastrad 	subs	x3, x3, #1	/* count down rounds remaining */
   1157  1.10  riastrad 	b.ne	1b
   1158   1.4  riastrad 	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
   1159   1.4  riastrad 	aesd	v0.16b, v16.16b
   1160  1.10  riastrad 	ldr	q16, [x0]		/* load last round key */
   1161  1.10  riastrad 	/* q0 := AddRoundKey_q16(q0) */
   1162   1.7  riastrad 	eor	v0.16b, v0.16b, v16.16b
   1163   1.1  riastrad 	ret
   1164   1.1  riastrad END(aesarmv8_dec1)
   1165   1.1  riastrad 
   1166   1.1  riastrad /*
   1167   1.1  riastrad  * aesarmv8_dec8(const struct aesdec *deckey@x0,
   1168   1.1  riastrad  *     uint128_t block0@q0, ..., uint128_t block7@q7,
   1169   1.1  riastrad  *     uint32_t nrounds@x3)
   1170   1.1  riastrad  *
   1171   1.1  riastrad  *	Decrypt eight AES blocks in q0 through q7 in parallel.
   1172   1.1  riastrad  *
   1173   1.4  riastrad  *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
   1174   1.1  riastrad  */
   1175   1.1  riastrad 	.text
   1176   1.1  riastrad 	_ALIGN_TEXT
   1177   1.1  riastrad 	.type	aesarmv8_dec8,@function
   1178   1.1  riastrad aesarmv8_dec8:
   1179   1.4  riastrad 	ldr	q16, [x0], #0x10	/* load round key */
   1180  1.10  riastrad 	sub	x3, x3, #1	/* x3 := nrounds - 1 (last round has no aesimc) */
   1181   1.9  riastrad 	_ALIGN_TEXT
   1182  1.10  riastrad 1:	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
   1183  1.10  riastrad 	aesd	v0.16b, v16.16b
   1184  1.10  riastrad 	/* q[i] := InMixColumns(q[i]) */
   1185   1.7  riastrad 	aesimc	v0.16b, v0.16b
   1186  1.10  riastrad 	aesd	v1.16b, v16.16b
   1187   1.7  riastrad 	aesimc	v1.16b, v1.16b
   1188  1.10  riastrad 	aesd	v2.16b, v16.16b
   1189   1.7  riastrad 	aesimc	v2.16b, v2.16b
   1190  1.10  riastrad 	aesd	v3.16b, v16.16b
   1191   1.7  riastrad 	aesimc	v3.16b, v3.16b
   1192  1.10  riastrad 	aesd	v4.16b, v16.16b
   1193   1.7  riastrad 	aesimc	v4.16b, v4.16b
   1194  1.10  riastrad 	aesd	v5.16b, v16.16b
   1195   1.7  riastrad 	aesimc	v5.16b, v5.16b
   1196  1.10  riastrad 	aesd	v6.16b, v16.16b
   1197   1.7  riastrad 	aesimc	v6.16b, v6.16b
   1198  1.10  riastrad 	aesd	v7.16b, v16.16b
   1199   1.7  riastrad 	aesimc	v7.16b, v7.16b
   1200  1.10  riastrad 	ldr	q16, [x0], #0x10	/* load next round key */
   1201  1.10  riastrad 	subs	x3, x3, #1	/* count down rounds remaining */
   1202  1.10  riastrad 	b.ne	1b
   1203   1.4  riastrad 	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
   1204   1.4  riastrad 	aesd	v0.16b, v16.16b
   1205   1.4  riastrad 	aesd	v1.16b, v16.16b
   1206   1.4  riastrad 	aesd	v2.16b, v16.16b
   1207   1.4  riastrad 	aesd	v3.16b, v16.16b
   1208   1.4  riastrad 	aesd	v4.16b, v16.16b
   1209   1.4  riastrad 	aesd	v5.16b, v16.16b
   1210   1.4  riastrad 	aesd	v6.16b, v16.16b
   1211   1.4  riastrad 	aesd	v7.16b, v16.16b
   1212  1.10  riastrad 	ldr	q16, [x0]		/* load last round key */
   1213  1.10  riastrad 	/* q[i] := AddRoundKey_q16(q[i]) */
   1214  1.10  riastrad 	eor	v0.16b, v0.16b, v16.16b
   1215   1.4  riastrad 	eor	v1.16b, v1.16b, v16.16b
   1216   1.4  riastrad 	eor	v2.16b, v2.16b, v16.16b
   1217   1.4  riastrad 	eor	v3.16b, v3.16b, v16.16b
   1218   1.4  riastrad 	eor	v4.16b, v4.16b, v16.16b
   1219   1.4  riastrad 	eor	v5.16b, v5.16b, v16.16b
   1220   1.4  riastrad 	eor	v6.16b, v6.16b, v16.16b
   1221   1.4  riastrad 	eor	v7.16b, v7.16b, v16.16b
   1222   1.1  riastrad 	ret
   1223   1.1  riastrad END(aesarmv8_dec8)
   1224