/*	$NetBSD: aes_ni_64.S,v 1.6 2020/07/27 20:57:23 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

RCSID("$NetBSD: aes_ni_64.S,v 1.6 2020/07/27 20:57:23 riastradh Exp $")

/*
 * MOVDQA/MOVDQU are Move Double Quadword (Aligned/Unaligned), defined
 * to operate on integers; MOVAPS/MOVUPS are Move (Aligned/Unaligned)
 * Packed Single, defined to operate on binary32 floats.  They have
 * exactly the same architectural effects (move a 128-bit quantity from
 * memory into an xmm register).
 *
 * In principle, they might have different microarchitectural effects
 * so that MOVAPS/MOVUPS might incur a penalty when the register is
 * later used for integer paths, but in practice they don't.  So we use
 * the one whose instruction encoding is shorter -- MOVAPS/MOVUPS.
 */
#define	movdqa	movaps
#define	movdqu	movups

/*
 * aesni_setenckey128(struct aesenc *enckey@rdi, const uint8_t key[16] @rsi)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey128)
	movdqu	(%rsi),%xmm0	/* load master key into %xmm0 */
	movdqa	%xmm0,(%rdi)	/* store master key as the first round key */
	lea	0x10(%rdi),%rdi	/* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x4,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x10,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x40,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x80,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x1b,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x36,%xmm0,%xmm2
	call	aesni_expand128
	ret
END(aesni_setenckey128)
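
/*
 * For reference, the aeskeygenassist/aesni_expand128 pairs above
 * implement the standard word-oriented AES-128 key schedule of FIPS
 * 197.  A hedged C sketch (not the kernel's reference code; SubWord,
 * RotWord, and Rcon as defined in FIPS 197, Rcon 1-indexed):
 *
 *	uint32_t w[4*11];		// 11 round keys of 4 words each
 *	memcpy(w, key, 16);		// round key 0 is the master key
 *	for (unsigned i = 4; i < 4*11; i++) {
 *		uint32_t t = w[i - 1];
 *		if (i % 4 == 0)		// first word of each round key
 *			t = SubWord(RotWord(t)) ^ Rcon[i/4];
 *		w[i] = w[i - 4] ^ t;
 *	}
 *
 * The $0x1,$0x2,...,$0x1b,$0x36 immediates above are exactly
 * Rcon[1..10].
 */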

/*
 * aesni_setenckey192(struct aesenc *enckey@rdi, const uint8_t key[24] @rsi)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey192)
	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
	movq	0x10(%rsi),%xmm1 /* load master key [128:192) into %xmm1 */
	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
	lea	0x10(%rdi),%rdi /* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x4,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x10,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x40,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x80,%xmm0,%xmm2
	call	aesni_expand192b
	ret
END(aesni_setenckey192)

/*
 * aesni_setenckey256(struct aesenc *enckey@rdi, const uint8_t key[32] @rsi)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey256)
	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
	movdqu	0x10(%rsi),%xmm1 /* load master key [128:256) into %xmm1 */
	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
	movdqa	%xmm1,0x10(%rdi) /* store master key [128:256) as round key */
	lea	0x20(%rdi),%rdi	/* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x1,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x2,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x4,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x4,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x8,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x10,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x10,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x20,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x40,%xmm1,%xmm2
	call	aesni_expand256a
	ret
END(aesni_setenckey256)

/*
 * aesni_expand128(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *     uint128_t keygenassist@xmm2)
 *
 *	1. Compute the AES-128 round key using the previous round key.
 *	2. Store it at *rkp.
 *	3. Set %xmm0 to it.
 *	4. Advance %rdi to point at the next round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = rk, the round key we just computed
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *
 *	Note: %xmm1 is preserved (as are %xmm3 and %xmm7 through %xmm15,
 *	and all other registers).
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand128,@function
aesni_expand128:
	/*
	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
	 * i.e., set each word of %xmm2 to t := Rot(SubWord(prk[3])) ^ RCON.
	 */
	pshufd	$0b11111111,%xmm2,%xmm2

	/*
	 * %xmm4 := (0, prk[0], prk[1], prk[2])
	 * %xmm5 := (0, 0, prk[0], prk[1])
	 * %xmm6 := (0, 0, 0, prk[0])
	 */
	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm6
	pslldq	$4,%xmm4
	pslldq	$8,%xmm5
	pslldq	$12,%xmm6

	/*
	 * %xmm0 := (rk[0] = t ^ prk[0],
	 *     rk[1] = t ^ prk[0] ^ prk[1],
	 *     rk[2] = t ^ prk[0] ^ prk[1] ^ prk[2],
	 *     rk[3] = t ^ prk[0] ^ prk[1] ^ prk[2] ^ prk[3])
	 */
	pxor	%xmm2,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm0
	pxor	%xmm6,%xmm0

	movdqa	%xmm0,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand128)
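
/*
 * The PSLLDQ/PXOR sequence above is a four-term prefix XOR.  What
 * aesni_expand128 computes, as a hedged C sketch on 32-bit words:
 *
 *	void
 *	expand128(uint32_t rk[4], const uint32_t prk[4], uint32_t t)
 *	{
 *		rk[0] = t ^ prk[0];
 *		rk[1] = rk[0] ^ prk[1];
 *		rk[2] = rk[1] ^ prk[2];
 *		rk[3] = rk[2] ^ prk[3];
 *	}
 *
 * Broadcasting t and XORing in the three shifted copies of prk yields
 * all four words at once instead of chaining serially through rk[i-1].
 */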

/*
 * aesni_expand192a(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *     uint64_t rklo@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set even-numbered AES-192 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to two round keys to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm1 = (rklo[0], rklo[1], xxx, xxx)
 *		%xmm2 = (xxx, t = Rot(SubWord(rklo[1])) ^ RCON, xxx, xxx)
 *
 *	On exit:
 *
 *		%rdi = &rkp[2], rkp advanced by two round keys
 *		%xmm0 = nrk, second round key we just computed
 *		%xmm1 = rk, first round key we just computed
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *		%xmm7 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand192a,@function
aesni_expand192a:
	/*
	 * %xmm2 := (%xmm2[1], %xmm2[1], %xmm2[1], %xmm2[1]),
	 * i.e., set each word of %xmm2 to t := Rot(SubWord(rklo[1])) ^ RCON.
	 */
	pshufd	$0b01010101,%xmm2,%xmm2

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *     ^ rklo[1]
	 */

	/*
	 * %xmm4 := (prk[0], prk[1], prk[2], prk[3])
	 * %xmm5 := (0, prk[0], prk[1], prk[2])
	 * %xmm6 := (0, 0, prk[0], prk[1])
	 * %xmm7 := (0, 0, 0, prk[0])
	 */
	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm6
	movdqa	%xmm0,%xmm7
	pslldq	$4,%xmm5
	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/* %xmm4 := (rk[2], rk[3], nrk[0], nrk[1]) */
	pxor	%xmm2,%xmm4
	pxor	%xmm5,%xmm4
	pxor	%xmm6,%xmm4
	pxor	%xmm7,%xmm4

	/*
	 * At this point, rk is split across %xmm1 (rk[0],rk[1],...) and
	 * %xmm4 (rk[2],rk[3],...); nrk is in %xmm4 (...,nrk[0],nrk[1]);
	 * and we have yet to compute nrk[2] or nrk[3], which requires
	 * rklo[0] and rklo[1] in %xmm1 (rklo[0], rklo[1], ...).  We need
	 * nrk to end up in %xmm0 at the end, so gather rk into %xmm1 and
	 * nrk into %xmm0.
	 */

	/* %xmm0 := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	pshufd	$0b11111110,%xmm4,%xmm0

	/*
	 * %xmm6 := (0, 0, rklo[0], rklo[1])
	 * %xmm7 := (0, 0, 0, rklo[0])
	 */
	movdqa	%xmm1,%xmm6
	movdqa	%xmm1,%xmm7

	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/*
	 * %xmm0 := (nrk[0],
	 *     nrk[1],
	 *     nrk[2] = nrk[1] ^ rklo[0],
	 *     nrk[3] = nrk[1] ^ rklo[0] ^ rklo[1])
	 */
	pxor	%xmm6,%xmm0
	pxor	%xmm7,%xmm0

	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
	shufps	$0b01000100,%xmm4,%xmm1

	movdqa	%xmm1,(%rdi)		/* store round key */
	movdqa	%xmm0,0x10(%rdi)	/* store next round key */
	lea	0x20(%rdi),%rdi		/* advance two round keys */
	ret
END(aesni_expand192a)

/*
 * aesni_expand192b(uint128_t *roundkey@rdi, uint128_t prk@xmm0,
 *     uint128_t keygenassist@xmm2)
 *
 *	Set odd-numbered AES-192 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm1 = (xxx, xxx, pprk[2], pprk[3])
 *		%xmm2 = (xxx, xxx, xxx, t = Rot(Sub(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = rk, the round key we just computed
 *		%xmm1 = (nrk[0], nrk[1], xxx, xxx), half of next round key
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *		%xmm7 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand192b,@function
aesni_expand192b:
	/*
	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
	 * i.e., set each word of %xmm2 to t := Rot(Sub(prk[3])) ^ RCON.
	 */
	pshufd	$0b11111111,%xmm2,%xmm2

	/*
	 * We need to compute:
	 *
	 * rk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2]
	 * rk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3]
	 * rk[2] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 * rk[3] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *     ^ prk[1]
	 * nrk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *     ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *     ^ prk[1] ^ prk[2] ^ prk[3]
	 */

	/* %xmm1 := (pprk[2], pprk[3], prk[0], prk[1]) */
	shufps	$0b01001110,%xmm0,%xmm1

	/*
	 * %xmm5 := (0, pprk[2], pprk[3], prk[0])
	 * %xmm6 := (0, 0, pprk[2], pprk[3])
	 * %xmm7 := (0, 0, 0, pprk[2])
	 */
	movdqa	%xmm1,%xmm5
	movdqa	%xmm1,%xmm6
	movdqa	%xmm1,%xmm7
	pslldq	$4,%xmm5
	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
	pxor	%xmm2,%xmm1
	pxor	%xmm5,%xmm1
	pxor	%xmm6,%xmm1
	pxor	%xmm7,%xmm1

	/* %xmm4 := (prk[2], prk[3], xxx, xxx) */
	pshufd	$0b00001110,%xmm0,%xmm4

	/* %xmm5 := (0, prk[2], xxx, xxx) */
	movdqa	%xmm4,%xmm5
	pslldq	$4,%xmm5

	/* %xmm0 := (rk[0], rk[1], rk[2], rk[3]) */
	movdqa	%xmm1,%xmm0

	/* %xmm1 := (rk[3], rk[3], xxx, xxx) */
	shufps	$0b00001111,%xmm1,%xmm1

	/*
	 * %xmm1 := (nrk[0] = rk[3] ^ prk[2],
	 *     nrk[1] = rk[3] ^ prk[2] ^ prk[3],
	 *     xxx,
	 *     xxx)
	 */
	pxor	%xmm4,%xmm1
	pxor	%xmm5,%xmm1

	movdqa	%xmm0,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand192b)

/*
 * aesni_expand256a(uint128_t *rkp@rdi, uint128_t pprk@xmm0,
 *     uint128_t prk@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set even-numbered AES-256 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (pprk[0], pprk[1], pprk[2], pprk[3])
 *		%xmm1 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = rk, the round key we just computed
 *		%xmm1 = prk, previous round key, preserved from entry
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *
 *	The computation turns out to be the same as for AES-128; the
 *	previous round key does not figure into it, only the
 *	previous-previous round key.
 */
	aesni_expand256a = aesni_expand128

/*
 * aesni_expand256b(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *     uint128_t pprk@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set odd-numbered AES-256 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm1 = (pprk[0], pprk[1], pprk[2], pprk[3])
 *		%xmm2 = (xxx, xxx, t = Sub(prk[3]), xxx)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = prk, previous round key, preserved from entry
 *		%xmm1 = rk, the round key we just computed
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand256b,@function
aesni_expand256b:
	/*
	 * %xmm2 := (%xmm2[2], %xmm2[2], %xmm2[2], %xmm2[2]),
	 * i.e., set each word of %xmm2 to t := Sub(prk[3]).
	 */
	pshufd	$0b10101010,%xmm2,%xmm2

	/*
	 * %xmm4 := (0, pprk[0], pprk[1], pprk[2])
	 * %xmm5 := (0, 0, pprk[0], pprk[1])
	 * %xmm6 := (0, 0, 0, pprk[0])
	 */
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm5
	movdqa	%xmm1,%xmm6
	pslldq	$4,%xmm4
	pslldq	$8,%xmm5
	pslldq	$12,%xmm6

	/*
	 * %xmm1 := (rk[0] = t ^ pprk[0],
	 *     rk[1] = t ^ pprk[0] ^ pprk[1],
	 *     rk[2] = t ^ pprk[0] ^ pprk[1] ^ pprk[2],
	 *     rk[3] = t ^ pprk[0] ^ pprk[1] ^ pprk[2] ^ pprk[3])
	 */
	pxor	%xmm2,%xmm1
	pxor	%xmm4,%xmm1
	pxor	%xmm5,%xmm1
	pxor	%xmm6,%xmm1

	movdqa	%xmm1,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand256b)

/*
 * aesni_enctodec(const struct aesenc *enckey@rdi, struct aesdec *deckey@rsi,
 *     uint32_t nrounds@rdx)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_enctodec)
	shl	$4,%edx		/* rdx := byte offset of last round key */
	movdqa	(%rdi,%rdx),%xmm0	/* load last round key */
	movdqa	%xmm0,(%rsi)	/* store last round key verbatim */
	jmp	2f
	_ALIGN_TEXT
1:	movdqa	(%rdi,%rdx),%xmm0	/* load round key */
	aesimc	%xmm0,%xmm0	/* convert encryption to decryption */
	movdqa	%xmm0,(%rsi)	/* store round key */
2:	sub	$0x10,%rdx	/* advance to next round key */
	lea	0x10(%rsi),%rsi
	jnz	1b		/* repeat if more rounds */
	movdqa	(%rdi),%xmm0	/* load first round key */
	movdqa	%xmm0,(%rsi)	/* store first round key verbatim */
	ret
END(aesni_enctodec)
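
/*
 * This builds the key schedule for the `equivalent inverse cipher' of
 * FIPS 197: the round keys are reversed, and all but the first and
 * last pass through InvMixColumns (the AESIMC instruction).  A hedged
 * C sketch, with a hypothetical InvMixColumns() on one 16-byte round
 * key standing in for AESIMC:
 *
 *	deckey[0] = enckey[nrounds];		// verbatim
 *	for (unsigned i = 1; i < nrounds; i++)
 *		deckey[i] = InvMixColumns(enckey[nrounds - i]);
 *	deckey[nrounds] = enckey[0];		// verbatim
 */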

/*
 * aesni_enc(const struct aesenc *enckey@rdi, const uint8_t in[16] @rsi,
 *     uint8_t out[16] @rdx, uint32_t nrounds@ecx)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_enc)
	movdqu	(%rsi),%xmm0
	call	aesni_enc1
	movdqu	%xmm0,(%rdx)
	ret
END(aesni_enc)

/*
 * aesni_dec(const struct aesdec *deckey@rdi, const uint8_t in[16] @rsi,
 *     uint8_t out[16] @rdx, uint32_t nrounds@ecx)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_dec)
	movdqu	(%rsi),%xmm0
	call	aesni_dec1
	movdqu	%xmm0,(%rdx)
	ret
END(aesni_dec)

/*
 * aesni_cbc_enc(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t iv[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_enc)
	cmp	$0,%rcx
	jz	2f
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm0		/* xmm0 := chaining value */
	_ALIGN_TEXT
1:	movdqu	(%rsi),%xmm1		/* xmm1 := plaintext block */
	lea	0x10(%rsi),%rsi
	pxor	%xmm1,%xmm0		/* xmm0 := cv ^ ptxt */
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_enc1		/* xmm0 := ciphertext block */
	movdqu	%xmm0,(%rdx)
	lea	0x10(%rdx),%rdx
	sub	$0x10,%r10
	jnz	1b			/* repeat if r10 is nonzero */
	movdqu	%xmm0,(%r8)		/* store chaining value */
2:	ret
END(aesni_cbc_enc)
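
/*
 * This is the usual CBC recurrence; a hedged C sketch, with
 * AES_Encrypt() standing in for aesni_enc1:
 *
 *	cv = iv;
 *	for (unsigned i = 0; i < nblocks; i++) {
 *		cv = AES_Encrypt(enckey, cv ^ ptxt[i]);
 *		ctxt[i] = cv;
 *	}
 *	iv = cv;		// chaining value for the next call
 *
 * Each block's input depends on the previous block's output, so CBC
 * encryption is inherently serial; that is why there is no eight-block
 * variant of this routine, unlike CBC decryption below.
 */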

/*
 * aesni_cbc_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
 *     uint32_t nrounds@r9)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesni_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_dec1)
	push	%rbp			/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	movdqu	(%r8),%xmm8		/* xmm8 := iv */
	movdqa	%xmm8,(%rsp)		/* save iv */
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	-0x10(%rsi,%r10),%xmm0	/* xmm0 := last ciphertext block */
	movdqu	%xmm0,(%r8)		/* update iv */
	jmp	2f
	_ALIGN_TEXT
1:	movdqu	-0x10(%rsi,%r10),%xmm8	/* xmm8 := chaining value */
	pxor	%xmm8,%xmm0		/* xmm0 := ptxt */
	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
	movdqa	%xmm8,%xmm0		/* move cv = ciphertext block */
2:	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_dec1		/* xmm0 := cv ^ ptxt */
	sub	$0x10,%r10
	jnz	1b			/* repeat if more blocks */
	pxor	(%rsp),%xmm0		/* xmm0 := ptxt */
	movdqu	%xmm0,(%rdx)		/* store first plaintext block */
	leave
	ret
END(aesni_cbc_dec1)
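
/*
 * CBC decryption, unlike encryption, is parallelizable: each
 * plaintext block depends on just two adjacent ciphertext blocks.  A
 * hedged C sketch of the loop above, with ctxt[-1] standing for the
 * iv and AES_Decrypt() for aesni_dec1:
 *
 *	for (int i = nblocks - 1; i >= 0; i--)
 *		ptxt[i] = AES_Decrypt(deckey, ctxt[i]) ^ ctxt[i - 1];
 *
 * Walking backwards lets the routine update the caller's iv buffer
 * with the last ciphertext block before the loop and reuse each
 * loaded ciphertext block as the next iteration's chaining value.
 */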

/*
 * aesni_cbc_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
 *     uint32_t nrounds@r9)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_dec8)
	push	%rbp			/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	movdqu	(%r8),%xmm8		/* xmm8 := iv */
	movdqa	%xmm8,(%rsp)		/* save iv */
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := ciphertext block[n-1] */
	movdqu	%xmm7,(%r8)		/* update iv */
	jmp	2f
	_ALIGN_TEXT
1:	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := cv[0] */
	pxor	%xmm7,%xmm0		/* xmm0 := ptxt[0] */
	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
2:	movdqu	-0x20(%rsi,%r10),%xmm6	/* xmm6 := ciphertext block[n-2] */
	movdqu	-0x30(%rsi,%r10),%xmm5	/* xmm5 := ciphertext block[n-3] */
	movdqu	-0x40(%rsi,%r10),%xmm4	/* xmm4 := ciphertext block[n-4] */
	movdqu	-0x50(%rsi,%r10),%xmm3	/* xmm3 := ciphertext block[n-5] */
	movdqu	-0x60(%rsi,%r10),%xmm2	/* xmm2 := ciphertext block[n-6] */
	movdqu	-0x70(%rsi,%r10),%xmm1	/* xmm1 := ciphertext block[n-7] */
	movdqu	-0x80(%rsi,%r10),%xmm0	/* xmm0 := ciphertext block[n-8] */
	movdqa	%xmm6,%xmm15		/* xmm[8+i] := cv[i], 0<i<8 */
	movdqa	%xmm5,%xmm14
	movdqa	%xmm4,%xmm13
	movdqa	%xmm3,%xmm12
	movdqa	%xmm2,%xmm11
	movdqa	%xmm1,%xmm10
	movdqa	%xmm0,%xmm9
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_dec8		/* xmm[i] := cv[i] ^ ptxt[i], 0<=i<8 */
	pxor	%xmm15,%xmm7		/* xmm[i] := ptxt[i], 0<i<8 */
	pxor	%xmm14,%xmm6
	pxor	%xmm13,%xmm5
	pxor	%xmm12,%xmm4
	pxor	%xmm11,%xmm3
	pxor	%xmm10,%xmm2
	pxor	%xmm9,%xmm1
	movdqu	%xmm7,-0x10(%rdx,%r10)	/* store plaintext blocks */
	movdqu	%xmm6,-0x20(%rdx,%r10)
	movdqu	%xmm5,-0x30(%rdx,%r10)
	movdqu	%xmm4,-0x40(%rdx,%r10)
	movdqu	%xmm3,-0x50(%rdx,%r10)
	movdqu	%xmm2,-0x60(%rdx,%r10)
	movdqu	%xmm1,-0x70(%rdx,%r10)
	sub	$0x80,%r10
	jnz	1b			/* repeat if more blocks */
	pxor	(%rsp),%xmm0		/* xmm0 := ptxt[0] */
	movdqu	%xmm0,(%rdx)		/* store first plaintext block */
	leave
	ret
END(aesni_cbc_dec8)

/*
 * aesni_xts_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesni_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_enc1)
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm15		/* xmm15 := tweak */
	_ALIGN_TEXT
1:	movdqu	(%rsi),%xmm0		/* xmm0 := ptxt */
	lea	0x10(%rsi),%rsi		/* advance rsi to next block */
	pxor	%xmm15,%xmm0		/* xmm0 := ptxt ^ tweak */
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_enc1		/* xmm0 := AES(ptxt ^ tweak) */
	pxor	%xmm15,%xmm0		/* xmm0 := AES(ptxt ^ tweak) ^ tweak */
	movdqu	%xmm0,(%rdx)		/* store ciphertext block */
	lea	0x10(%rdx),%rdx		/* advance rdx to next block */
	call	aesni_xts_mulx		/* xmm15 *= x; trash xmm0 */
	sub	$0x10,%r10
	jnz	1b			/* repeat if more blocks */
	movdqu	%xmm15,(%r8)		/* update tweak */
	ret
END(aesni_xts_enc1)
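
/*
 * Per-block XTS, as a hedged C sketch, with AES_Encrypt() standing in
 * for aesni_enc1 and mulx() for aesni_xts_mulx below:
 *
 *	for (unsigned i = 0; i < nblocks; i++) {
 *		ctxt[i] = AES_Encrypt(enckey, ptxt[i] ^ tweak) ^ tweak;
 *		tweak = mulx(tweak);
 *	}
 *
 * aesni_xts_dec1 below is the same loop around aesni_dec1; the tweak
 * schedule is identical in both directions.
 */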

/*
 * aesni_xts_enc8(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_enc8)
	push	%rbp			/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm15		/* xmm15 := tweak[0] */
	_ALIGN_TEXT
1:	movdqa	%xmm15,%xmm8		/* xmm8 := tweak[0] */
	call	aesni_xts_mulx		/* xmm15 := tweak[1] */
	movdqa	%xmm15,%xmm9		/* xmm9 := tweak[1] */
	call	aesni_xts_mulx		/* xmm15 := tweak[2] */
	movdqa	%xmm15,%xmm10		/* xmm10 := tweak[2] */
	call	aesni_xts_mulx		/* xmm15 := tweak[3] */
	movdqa	%xmm15,%xmm11		/* xmm11 := tweak[3] */
	call	aesni_xts_mulx		/* xmm15 := tweak[4] */
	movdqa	%xmm15,%xmm12		/* xmm12 := tweak[4] */
	call	aesni_xts_mulx		/* xmm15 := tweak[5] */
	movdqa	%xmm15,%xmm13		/* xmm13 := tweak[5] */
	call	aesni_xts_mulx		/* xmm15 := tweak[6] */
	movdqa	%xmm15,%xmm14		/* xmm14 := tweak[6] */
	call	aesni_xts_mulx		/* xmm15 := tweak[7] */
	movdqu	(%rsi),%xmm0		/* xmm[i] := ptxt[i] */
	movdqu	0x10(%rsi),%xmm1
	movdqu	0x20(%rsi),%xmm2
	movdqu	0x30(%rsi),%xmm3
	movdqu	0x40(%rsi),%xmm4
	movdqu	0x50(%rsi),%xmm5
	movdqu	0x60(%rsi),%xmm6
	movdqu	0x70(%rsi),%xmm7
	lea	0x80(%rsi),%rsi		/* advance rsi to next block group */
	movdqa	%xmm8,(%rsp)		/* save tweak[0] */
	pxor	%xmm8,%xmm0		/* xmm[i] := ptxt[i] ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_enc8		/* xmm[i] := AES(ptxt[i] ^ tweak[i]) */
	pxor	(%rsp),%xmm0		/* xmm[i] := AES(...) ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	movdqu	%xmm0,(%rdx)		/* store ciphertext blocks */
	movdqu	%xmm1,0x10(%rdx)
	movdqu	%xmm2,0x20(%rdx)
	movdqu	%xmm3,0x30(%rdx)
	movdqu	%xmm4,0x40(%rdx)
	movdqu	%xmm5,0x50(%rdx)
	movdqu	%xmm6,0x60(%rdx)
	movdqu	%xmm7,0x70(%rdx)
	lea	0x80(%rdx),%rdx		/* advance rdx to next block group */
	call	aesni_xts_mulx		/* xmm15 := tweak[8] */
	sub	$0x80,%r10
	jnz	1b			/* repeat if more block groups */
	movdqu	%xmm15,(%r8)		/* update tweak */
	leave
	ret
END(aesni_xts_enc8)

/*
 * aesni_xts_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesni_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_dec1)
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm15		/* xmm15 := tweak */
	_ALIGN_TEXT
1:	movdqu	(%rsi),%xmm0		/* xmm0 := ctxt */
	lea	0x10(%rsi),%rsi		/* advance rsi to next block */
	pxor	%xmm15,%xmm0		/* xmm0 := ctxt ^ tweak */
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_dec1		/* xmm0 := AES(ctxt ^ tweak) */
	pxor	%xmm15,%xmm0		/* xmm0 := AES(ctxt ^ tweak) ^ tweak */
	movdqu	%xmm0,(%rdx)		/* store plaintext block */
	lea	0x10(%rdx),%rdx		/* advance rdx to next block */
	call	aesni_xts_mulx		/* xmm15 *= x; trash xmm0 */
	sub	$0x10,%r10
	jnz	1b			/* repeat if more blocks */
	movdqu	%xmm15,(%r8)		/* update tweak */
	ret
END(aesni_xts_dec1)

/*
 * aesni_xts_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_dec8)
	push	%rbp			/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm15		/* xmm15 := tweak[0] */
	_ALIGN_TEXT
1:	movdqa	%xmm15,%xmm8		/* xmm8 := tweak[0] */
	call	aesni_xts_mulx		/* xmm15 := tweak[1] */
	movdqa	%xmm15,%xmm9		/* xmm9 := tweak[1] */
	call	aesni_xts_mulx		/* xmm15 := tweak[2] */
	movdqa	%xmm15,%xmm10		/* xmm10 := tweak[2] */
	call	aesni_xts_mulx		/* xmm15 := tweak[3] */
	movdqa	%xmm15,%xmm11		/* xmm11 := tweak[3] */
	call	aesni_xts_mulx		/* xmm15 := tweak[4] */
	movdqa	%xmm15,%xmm12		/* xmm12 := tweak[4] */
	call	aesni_xts_mulx		/* xmm15 := tweak[5] */
	movdqa	%xmm15,%xmm13		/* xmm13 := tweak[5] */
	call	aesni_xts_mulx		/* xmm15 := tweak[6] */
	movdqa	%xmm15,%xmm14		/* xmm14 := tweak[6] */
	call	aesni_xts_mulx		/* xmm15 := tweak[7] */
	movdqu	(%rsi),%xmm0		/* xmm[i] := ctxt[i] */
	movdqu	0x10(%rsi),%xmm1
	movdqu	0x20(%rsi),%xmm2
	movdqu	0x30(%rsi),%xmm3
	movdqu	0x40(%rsi),%xmm4
	movdqu	0x50(%rsi),%xmm5
	movdqu	0x60(%rsi),%xmm6
	movdqu	0x70(%rsi),%xmm7
	lea	0x80(%rsi),%rsi		/* advance rsi to next block group */
	movdqa	%xmm8,(%rsp)		/* save tweak[0] */
	pxor	%xmm8,%xmm0		/* xmm[i] := ctxt[i] ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_dec8		/* xmm[i] := AES(ctxt[i] ^ tweak[i]) */
	pxor	(%rsp),%xmm0		/* xmm[i] := AES(...) ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	movdqu	%xmm0,(%rdx)		/* store plaintext blocks */
	movdqu	%xmm1,0x10(%rdx)
	movdqu	%xmm2,0x20(%rdx)
	movdqu	%xmm3,0x30(%rdx)
	movdqu	%xmm4,0x40(%rdx)
	movdqu	%xmm5,0x50(%rdx)
	movdqu	%xmm6,0x60(%rdx)
	movdqu	%xmm7,0x70(%rdx)
	lea	0x80(%rdx),%rdx		/* advance rdx to next block group */
	call	aesni_xts_mulx		/* xmm15 := tweak[8] */
	sub	$0x80,%r10
	jnz	1b			/* repeat if more block groups */
	movdqu	%xmm15,(%r8)		/* update tweak */
	leave
	ret
END(aesni_xts_dec8)

/*
 * aesni_xts_mulx(tweak@xmm15)
 *
 *	Multiply xmm15 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses %xmm0 as temporary.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_xts_mulx,@function
aesni_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low quadword must be
	 *     shifted into the low bit of the high quadword, and
	 * (b) whether the high bit of the high quadword must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	pxor	%xmm0,%xmm0	/* xmm0 := 0 */
	pcmpgtq	%xmm15,%xmm0	/* xmm0[i] := -1 if 0 > xmm15[i] else 0 */
	pshufd	$0b01001110,%xmm0,%xmm0	/* swap halves of xmm0 */
	pand	xtscarry(%rip),%xmm0	/* copy xtscarry according to mask */
	psllq	$1,%xmm15	/* shift */
	pxor	%xmm0,%xmm15	/* incorporate (a) and (b) */
	ret
END(aesni_xts_mulx)

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
END(xtscarry)
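
/*
 * What aesni_xts_mulx computes, as a hedged C sketch on the two
 * 64-bit lanes of the tweak (t[0] is the low quadword, as in the xmm
 * register):
 *
 *	carry = t[1] >> 63;			// (b) bit leaving x^127
 *	t[1] = (t[1] << 1) | (t[0] >> 63);	// (a) cross-lane bit
 *	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0); // x^128 = x^7+x^2+x+1
 *
 * The SSE2 code is the branch-free equivalent: PCMPGTQ extracts both
 * high bits as all-ones masks, PSHUFD swaps them into the opposite
 * lanes, and PAND against xtscarry = (0x87, 1) turns them into the
 * two XOR terms applied after the shift.
 */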

/*
 * aesni_xts_update(const uint8_t in[16] @rdi, uint8_t out[16] @rsi)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_update)
	movdqu	(%rdi),%xmm15
	call	aesni_xts_mulx
	movdqu	%xmm15,(%rsi)
	ret
END(aesni_xts_update)

/*
 * aesni_cbcmac_update1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     size_t nbytes@rdx, uint8_t auth[16] @rcx, uint32_t nrounds@r8d)
 *
 *	Update CBC-MAC.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbcmac_update1)
	movdqu	(%rcx),%xmm0		/* xmm0 := auth */
	mov	%rdx,%r10		/* r10 := nbytes */
	mov	%rcx,%rdx		/* rdx := &auth */
	_ALIGN_TEXT
1:	pxor	(%rsi),%xmm0		/* xmm0 ^= plaintext block */
	lea	0x10(%rsi),%rsi
	mov	%r8d,%ecx		/* ecx := nrounds */
	call	aesni_enc1		/* xmm0 := auth'; trash rax,rcx,xmm8 */
	sub	$0x10,%r10
	jnz	1b
	movdqu	%xmm0,(%rdx)		/* store auth' */
	ret
END(aesni_cbcmac_update1)
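
/*
 * CBC-MAC is CBC encryption with the ciphertext discarded; only the
 * final chaining value survives as the authenticator.  A hedged C
 * sketch, with AES_Encrypt() standing in for aesni_enc1:
 *
 *	for (unsigned i = 0; i < nblocks; i++)
 *		auth = AES_Encrypt(enckey, auth ^ in[i]);
 */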

/*
 * aesni_ccm_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx,
 *     uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
 *
 *	Update CCM encryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_ccm_enc1)
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	0x10(%r8),%xmm2		/* xmm2 := ctr (be) */
	movdqa	bswap32(%rip),%xmm4	/* xmm4 := bswap32 table */
	movdqa	ctr32_inc(%rip),%xmm5	/* xmm5 := (0,0,0,1) (le) */
	movdqu	(%r8),%xmm0		/* xmm0 := auth */
	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (le) */
	_ALIGN_TEXT
1:	movdqu	(%rsi),%xmm3		/* xmm3 := plaintext block */
	paddd	%xmm5,%xmm2		/* increment ctr (32-bit) */
	lea	0x10(%rsi),%rsi
	movdqa	%xmm2,%xmm1		/* xmm1 := ctr (le) */
	mov	%r9d,%ecx		/* ecx := nrounds */
	pshufb	%xmm4,%xmm1		/* xmm1 := ctr (be) */
	pxor	%xmm3,%xmm0		/* xmm0 := auth ^ ptxt */
	call	aesni_enc2		/* trash rax/rcx/xmm8 */
	pxor	%xmm1,%xmm3		/* xmm3 := ciphertext block */
	sub	$0x10,%r10		/* count down bytes */
	movdqu	%xmm3,(%rdx)		/* store ciphertext block */
	lea	0x10(%rdx),%rdx
	jnz	1b			/* repeat if more blocks */
	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (be) */
	movdqu	%xmm0,(%r8)		/* store updated auth */
	movdqu	%xmm2,0x10(%r8)		/* store updated ctr */
	ret
END(aesni_ccm_enc1)
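
/*
 * Each iteration advances the CBC-MAC over the plaintext and the CTR
 * keystream in lockstep; aesni_enc2 interleaves the two AES
 * computations so their round instructions can pipeline.  A hedged C
 * sketch, with AES_Encrypt() standing in for the AES-NI calls:
 *
 *	for (unsigned i = 0; i < nblocks; i++) {
 *		ctr++;			// 32-bit counter word, no carry out
 *		auth = AES_Encrypt(enckey, auth ^ ptxt[i]);
 *		ctxt[i] = ptxt[i] ^ AES_Encrypt(enckey, ctr);
 *	}
 */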

/*
 * aesni_ccm_dec1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx,
 *     uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
 *
 *	Update CCM decryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_ccm_dec1)
	movdqu	0x10(%r8),%xmm2		/* xmm2 := ctr (be) */
	movdqa	bswap32(%rip),%xmm4	/* xmm4 := bswap32 table */
	movdqa	ctr32_inc(%rip),%xmm5	/* xmm5 := (0,0,0,1) (le) */
	movdqu	(%r8),%xmm1		/* xmm1 := auth */
	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (le) */
	mov	%rcx,%r10		/* r10 := nbytes */

	/* Decrypt the first block.  */
	paddd	%xmm5,%xmm2		/* increment ctr (32-bit) */
	mov	%r9d,%ecx		/* ecx := nrounds */
	movdqa	%xmm2,%xmm0		/* xmm0 := ctr (le) */
	movdqu	(%rsi),%xmm3		/* xmm3 := ctxt */
	pshufb	%xmm4,%xmm0		/* xmm0 := ctr (be) */
	lea	0x10(%rsi),%rsi		/* advance input pointer */
	call	aesni_enc1		/* xmm0 := pad; trash rax/rcx/xmm8 */
	jmp	2f

	_ALIGN_TEXT
1:	/*
	 * Authenticate the last block and decrypt the next block
	 * simultaneously.
	 *
	 *	xmm1 = auth ^ ptxt[-1]
	 *	xmm2 = ctr[-1] (le)
	 */
	paddd	%xmm5,%xmm2		/* increment ctr (32-bit) */
	mov	%r9d,%ecx		/* ecx := nrounds */
	movdqa	%xmm2,%xmm0		/* xmm0 := ctr (le) */
	movdqu	(%rsi),%xmm3		/* xmm3 := ctxt */
	pshufb	%xmm4,%xmm0		/* xmm0 := ctr (be) */
	lea	0x10(%rsi),%rsi		/* advance input pointer */
	call	aesni_enc2		/* xmm0 := pad, xmm1 := auth';
					 * trash rax/rcx/xmm8 */
2:	pxor	%xmm0,%xmm3		/* xmm3 := ptxt */
	sub	$0x10,%r10		/* count down bytes */
	movdqu	%xmm3,(%rdx)		/* store plaintext */
	lea	0x10(%rdx),%rdx		/* advance output pointer */
	pxor	%xmm3,%xmm1		/* xmm1 := auth ^ ptxt */
	jnz	1b			/* repeat if more blocks */

	/* Authenticate the last block.  */
	movdqa	%xmm1,%xmm0		/* xmm0 := auth ^ ptxt */
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_enc1		/* xmm0 := auth'; trash rax/rcx/xmm8 */
	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (be) */
	movdqu	%xmm0,(%r8)		/* store updated auth */
	movdqu	%xmm2,0x10(%r8)		/* store updated ctr */
	ret
END(aesni_ccm_dec1)
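
/*
 * Decryption has a data hazard that encryption does not: the CBC-MAC
 * input is the plaintext, which is only available once the CTR pad
 * has been computed.  In the conventions of the sketch after
 * aesni_ccm_enc1 (hypothetical aes1 helper, rk round key array):
 *
 *	ctr = _mm_add_epi32(ctr, ctr32_inc);
 *	pad = aes1(rk, _mm_shuffle_epi8(ctr, bswap32), nrounds);
 *	pt = _mm_xor_si128(ct, pad);			// CTR decrypt
 *	auth = aes1(rk, _mm_xor_si128(auth, pt), nrounds); // then CBC-MAC
 *
 * Done naively that is two dependent AES calls per block, so the loop
 * above instead authenticates block i while decrypting block i+1 in
 * one aesni_enc2 call, with a lone aesni_enc1 at each end of the loop.
 */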

	.section .rodata
	.p2align 4
	.type	bswap32,@object
bswap32:
	.byte	3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
END(bswap32)

	.section .rodata
	.p2align 4
	.type	ctr32_inc,@object
ctr32_inc:
	.byte	0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0
END(ctr32_inc)
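
/*
 * bswap32 is a PSHUFB mask that reverses the bytes within each 32-bit
 * lane, converting each lane between big- and little-endian;
 * ctr32_inc is the 32-bit value 1 in the last lane.  Together they
 * implement the big-endian increment of the low 32 bits of the
 * counter block.  A sketch with intrinsics:
 *
 *	ctr_le = _mm_shuffle_epi8(ctr_be, bswap32); // lanes to le
 *	ctr_le = _mm_add_epi32(ctr_le, ctr32_inc);  // ++low 32 bits
 *	ctr_be = _mm_shuffle_epi8(ctr_le, bswap32); // lanes back to be
 *
 * The CCM loops keep the counter in little-endian form between blocks
 * so that only one shuffle per block is needed.
 */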

/*
 * aesni_enc1(const struct aesenc *enckey@rdi, uint128_t block@xmm0,
 *     uint32_t nrounds@ecx)
 *
 *	Encrypt a single AES block in %xmm0.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc1,@function
aesni_enc1:
	pxor	(%rdi),%xmm0	/* xor in first round key */
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
	_ALIGN_TEXT
1:	aesenc	%xmm8,%xmm0
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesenclast %xmm8,%xmm0
	ret
END(aesni_enc1)
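
/*
 * The indexing trick above deserves a note: with nrounds scaled to a
 * byte count and negated, the running offset reaches zero exactly at
 * the last round key, so the add doubles as the loop condition and no
 * separate counter is needed.  The control flow in C, as a sketch
 * assuming the key schedule is laid out as nrounds+1 round keys of 16
 * bytes each (which is how this code addresses it):
 *
 *	#include <immintrin.h>
 *
 *	static __m128i
 *	aes1(const __m128i *rk, __m128i b, unsigned nrounds)
 *	{
 *		unsigned i;
 *
 *		b = _mm_xor_si128(b, rk[0]);
 *		for (i = 1; i < nrounds; i++)
 *			b = _mm_aesenc_si128(b, rk[i]);
 *		return _mm_aesenclast_si128(b, rk[nrounds]);
 *	}
 */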

/*
 * aesni_enc2(const struct aesenc *enckey@rdi, uint128_t block0@xmm0,
 *     uint128_t block1@xmm1, uint32_t nrounds@ecx)
 *
 *	Encrypt two AES blocks in %xmm0 and %xmm1.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc2,@function
aesni_enc2:
	movdqa	(%rdi),%xmm8	/* xmm8 := first round key */
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	pxor	%xmm8,%xmm0	/* xor in first round key */
	pxor	%xmm8,%xmm1
	jmp	2f
	_ALIGN_TEXT
1:	aesenc	%xmm8,%xmm0
	aesenc	%xmm8,%xmm1
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesenclast %xmm8,%xmm0
	aesenclast %xmm8,%xmm1
	ret
END(aesni_enc2)
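
/*
 * Interleaving pays because AESENC has a latency of several cycles
 * but is pipelined (an assumption about current x86 implementations),
 * so two independent blocks finish in nearly the time of one;
 * aesni_enc8 below pushes the same idea to eight blocks.  As a sketch
 * in the style of aes1 above:
 *
 *	b0 = _mm_xor_si128(b0, rk[0]);
 *	b1 = _mm_xor_si128(b1, rk[0]);
 *	for (i = 1; i < nrounds; i++) {
 *		b0 = _mm_aesenc_si128(b0, rk[i]);
 *		b1 = _mm_aesenc_si128(b1, rk[i]);
 *	}
 *	b0 = _mm_aesenclast_si128(b0, rk[nrounds]);
 *	b1 = _mm_aesenclast_si128(b1, rk[nrounds]);
 */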

/*
 * aesni_enc8(const struct aesenc *enckey@rdi, uint128_t block0@xmm0, ...,
 *     block7@xmm7, uint32_t nrounds@ecx)
 *
 *	Encrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc8,@function
aesni_enc8:
	movdqa	(%rdi),%xmm8	/* xmm8 := first round key */
	pxor	%xmm8,%xmm0	/* xor in first round key */
	pxor	%xmm8,%xmm1
	pxor	%xmm8,%xmm2
	pxor	%xmm8,%xmm3
	pxor	%xmm8,%xmm4
	pxor	%xmm8,%xmm5
	pxor	%xmm8,%xmm6
	pxor	%xmm8,%xmm7
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
	_ALIGN_TEXT
1:	aesenc	%xmm8,%xmm0
	aesenc	%xmm8,%xmm1
	aesenc	%xmm8,%xmm2
	aesenc	%xmm8,%xmm3
	aesenc	%xmm8,%xmm4
	aesenc	%xmm8,%xmm5
	aesenc	%xmm8,%xmm6
	aesenc	%xmm8,%xmm7
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesenclast %xmm8,%xmm0
	aesenclast %xmm8,%xmm1
	aesenclast %xmm8,%xmm2
	aesenclast %xmm8,%xmm3
	aesenclast %xmm8,%xmm4
	aesenclast %xmm8,%xmm5
	aesenclast %xmm8,%xmm6
	aesenclast %xmm8,%xmm7
	ret
END(aesni_enc8)
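
/*
 * Eight blocks is the natural width here: the blocks occupy
 * %xmm0-%xmm7 and the round key lives in %xmm8, so going wider would
 * force spills.  Callers are expected to load eight blocks, make one
 * call, and store eight blocks per iteration.
 */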

/*
 * aesni_dec1(const struct aesdec *deckey@rdi, uint128_t block@xmm0,
 *     uint32_t nrounds@ecx)
 *
 *	Decrypt a single AES block in %xmm0.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_dec1,@function
aesni_dec1:
	pxor	(%rdi),%xmm0	/* xor in first round key */
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
	_ALIGN_TEXT
1:	aesdec	%xmm8,%xmm0
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesdeclast %xmm8,%xmm0
	ret
END(aesni_dec1)
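
/*
 * AESDEC implements the `equivalent inverse cipher': it expects the
 * decryption round keys to be the encryption round keys in reverse
 * order, transformed by InvMixColumns except for the first and last.
 * That is why this routine takes a struct aesdec prepared separately
 * rather than reusing the encryption schedule.  As a sketch (the
 * layout is an assumption here, not this routine's code):
 *
 *	drk[0] = erk[nrounds];
 *	for (i = 1; i < nrounds; i++)
 *		drk[i] = _mm_aesimc_si128(erk[nrounds - i]);
 *	drk[nrounds] = erk[0];
 */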

/*
 * aesni_dec8(const struct aesdec *deckey@rdi, uint128_t block0@xmm0, ...,
 *     block7@xmm7, uint32_t nrounds@ecx)
 *
 *	Decrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_dec8,@function
aesni_dec8:
	movdqa	(%rdi),%xmm8	/* xmm8 := first round key */
	pxor	%xmm8,%xmm0	/* xor in first round key */
	pxor	%xmm8,%xmm1
	pxor	%xmm8,%xmm2
	pxor	%xmm8,%xmm3
	pxor	%xmm8,%xmm4
	pxor	%xmm8,%xmm5
	pxor	%xmm8,%xmm6
	pxor	%xmm8,%xmm7
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
	_ALIGN_TEXT
1:	aesdec	%xmm8,%xmm0
	aesdec	%xmm8,%xmm1
	aesdec	%xmm8,%xmm2
	aesdec	%xmm8,%xmm3
	aesdec	%xmm8,%xmm4
	aesdec	%xmm8,%xmm5
	aesdec	%xmm8,%xmm6
	aesdec	%xmm8,%xmm7
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesdeclast %xmm8,%xmm0
	aesdeclast %xmm8,%xmm1
	aesdeclast %xmm8,%xmm2
	aesdeclast %xmm8,%xmm3
	aesdeclast %xmm8,%xmm4
	aesdeclast %xmm8,%xmm5
	aesdeclast %xmm8,%xmm6
	aesdeclast %xmm8,%xmm7
	ret
END(aesni_dec8)