/*	$NetBSD: aes_ni_64.S,v 1.4 2020/07/25 22:29:06 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * MOVDQA/MOVDQU are Move Double Quadword (Aligned/Unaligned), defined
 * to operate on integers; MOVAPS/MOVUPS are Move (Aligned/Unaligned)
 * Packed Single, defined to operate on binary32 floats.  They have
 * exactly the same architectural effects (move a 128-bit quantity from
 * memory into an xmm register).
 *
 * In principle, they might have different microarchitectural effects
 * so that MOVAPS/MOVUPS might incur a penalty when the register is
 * later used for integer paths, but in practice they don't.  So we use
 * the one whose instruction encoding is shorter -- MOVAPS/MOVUPS.
 */
#define	movdqa	movaps
#define	movdqu	movups
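
/*
 * Concretely: register-to-register MOVAPS encodes as 0f 28 /r (three
 * bytes) and MOVUPS as 0f 10 /r, while MOVDQA is 66 0f 6f /r and
 * MOVDQU is f3 0f 6f /r -- the integer forms each carry a mandatory
 * prefix byte, so the float forms save one byte per instruction.
 */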

/*
 * aesni_setenckey128(struct aesenc *enckey@rdi, const uint8_t key[16] @rsi)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey128)
	movdqu	(%rsi),%xmm0	/* load master key into %xmm0 */
	movdqa	%xmm0,(%rdi)	/* store master key as the first round key */
	lea	0x10(%rdi),%rdi	/* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x4,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x10,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x40,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x80,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x1b,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x36,%xmm0,%xmm2
	call	aesni_expand128
	ret
END(aesni_setenckey128)
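
/*
 * For reference, a C sketch of the same AES-128 schedule, with the
 * round keys as a flat word array w[] and hypothetical helpers
 * subw()/rotw() standing in for the SubWord/RotWord steps that
 * AESKEYGENASSIST computes:
 *
 *	static const uint32_t rcon[10] = {
 *		0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36,
 *	};
 *	uint32_t t;
 *	unsigned i;
 *
 *	for (i = 4; i < 44; i++) {
 *		t = w[i - 1];
 *		if (i % 4 == 0)
 *			t = subw(rotw(t)) ^ rcon[i/4 - 1];
 *		w[i] = w[i - 4] ^ t;
 *	}
 */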

/*
 * aesni_setenckey192(struct aesenc *enckey@rdi, const uint8_t key[24] @rsi)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey192)
	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
	movq	0x10(%rsi),%xmm1 /* load master key [128:192) into %xmm1 */
	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
	lea	0x10(%rdi),%rdi /* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x4,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x10,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x40,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x80,%xmm0,%xmm2
	call	aesni_expand192b
	ret
END(aesni_setenckey192)

/*
 * aesni_setenckey256(struct aesenc *enckey@rdi, const uint8_t key[32] @rsi)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey256)
	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
	movdqu	0x10(%rsi),%xmm1 /* load master key [128:256) into %xmm1 */
	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
	movdqa	%xmm1,0x10(%rdi) /* store master key [128:256) as round key */
	lea	0x20(%rdi),%rdi	/* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x1,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x2,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x4,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x4,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x8,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x10,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x10,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x20,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x40,%xmm1,%xmm2
	call	aesni_expand256a
	ret
END(aesni_setenckey256)

/*
 * aesni_expand128(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *     uint128_t keygenassist@xmm2)
 *
 *	1. Compute the AES-128 round key using the previous round key.
 *	2. Store it at *rkp.
 *	3. Set %xmm0 to it.
 *	4. Advance %rdi to point at the next round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = rk, the round key we just computed
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *
 *	Note: %xmm1 is preserved (as are %xmm3 and %xmm7 through %xmm15,
 *	and all other registers).
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand128,@function
aesni_expand128:
	/*
	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
	 * i.e., set each word of %xmm2 to t := Rot(SubWord(prk[3])) ^ RCON.
	 */
	pshufd	$0b11111111,%xmm2,%xmm2

	/*
	 * %xmm4 := (0, prk[0], prk[1], prk[2])
	 * %xmm5 := (0, 0, prk[0], prk[1])
	 * %xmm6 := (0, 0, 0, prk[0])
	 */
	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm6
	pslldq	$4,%xmm4
	pslldq	$8,%xmm5
	pslldq	$12,%xmm6

	/*
	 * %xmm0 := (rk[0] = t ^ prk[0],
	 *     rk[1] = t ^ prk[0] ^ prk[1],
	 *     rk[2] = t ^ prk[0] ^ prk[1] ^ prk[2],
	 *     rk[3] = t ^ prk[0] ^ prk[1] ^ prk[2] ^ prk[3])
	 */
	pxor	%xmm2,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm0
	pxor	%xmm6,%xmm0

	movdqa	%xmm0,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand128)
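
/*
 * The shift-and-XOR ladder above is a four-word prefix XOR: XORing
 * %xmm0 with copies of itself shifted left by one, two, and three
 * words leaves prk[0] ^ ... ^ prk[j] in word j, so one pass of four
 * PXORs folds t into every prefix at once.  One word at a time, the
 * equivalent C is:
 *
 *	rk[0] = t ^ prk[0];
 *	rk[1] = rk[0] ^ prk[1];
 *	rk[2] = rk[1] ^ prk[2];
 *	rk[3] = rk[2] ^ prk[3];
 */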

/*
 * aesni_expand192a(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *     uint64_t rklo@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set even-numbered AES-192 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to two round keys to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm1 = (rklo[0], rklo[1], xxx, xxx)
 *		%xmm2 = (xxx, t = Rot(SubWord(rklo[1])) ^ RCON, xxx, xxx)
 *
 *	On exit:
 *
 *		%rdi = &rkp[2], rkp advanced by two round keys
 *		%xmm0 = nrk, second round key we just computed
 *		%xmm1 = rk, first round key we just computed
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *		%xmm7 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand192a,@function
aesni_expand192a:
	/*
	 * %xmm2 := (%xmm2[1], %xmm2[1], %xmm2[1], %xmm2[1]),
	 * i.e., set each word of %xmm2 to t := Rot(SubWord(rklo[1])) ^ RCON.
	 */
	pshufd	$0b01010101,%xmm2,%xmm2

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *     ^ rklo[1]
	 */

	/*
	 * %xmm4 := (prk[0], prk[1], prk[2], prk[3])
	 * %xmm5 := (0, prk[0], prk[1], prk[2])
	 * %xmm6 := (0, 0, prk[0], prk[1])
	 * %xmm7 := (0, 0, 0, prk[0])
	 */
	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm6
	movdqa	%xmm0,%xmm7
	pslldq	$4,%xmm5
	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/* %xmm4 := (rk[2], rk[3], nrk[0], nrk[1]) */
	pxor	%xmm2,%xmm4
	pxor	%xmm5,%xmm4
	pxor	%xmm6,%xmm4
	pxor	%xmm7,%xmm4

	/*
	 * At this point, rk is split across %xmm1 (rk[0],rk[1],...) and
	 * %xmm4 (rk[2],rk[3],...); nrk is in %xmm4 (...,nrk[0],nrk[1]);
	 * and we have yet to compute nrk[2] or nrk[3], which requires
	 * rklo[0] and rklo[1] in %xmm1 (rklo[0], rklo[1], ...).  We need
	 * nrk to end up in %xmm0 at the end, so gather rk into %xmm1 and
	 * nrk into %xmm0.
	 */

	/* %xmm0 := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	pshufd	$0b11111110,%xmm4,%xmm0

	/*
	 * %xmm6 := (0, 0, rklo[0], rklo[1])
	 * %xmm7 := (0, 0, 0, rklo[0])
	 */
	movdqa	%xmm1,%xmm6
	movdqa	%xmm1,%xmm7

	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/*
	 * %xmm0 := (nrk[0],
	 *     nrk[1],
	 *     nrk[2] = nrk[1] ^ rklo[0],
	 *     nrk[3] = nrk[1] ^ rklo[0] ^ rklo[1])
	 */
	pxor	%xmm6,%xmm0
	pxor	%xmm7,%xmm0

	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
	shufps	$0b01000100,%xmm4,%xmm1

	movdqa	%xmm1,(%rdi)		/* store round key */
	movdqa	%xmm0,0x10(%rdi)	/* store next round key */
	lea	0x20(%rdi),%rdi		/* advance two round keys */
	ret
END(aesni_expand192a)

/*
 * aesni_expand192b(uint128_t *roundkey@rdi, uint128_t prk@xmm0,
 *     uint128_t keygenassist@xmm2)
 *
 *	Set odd-numbered AES-192 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm1 = (xxx, xxx, pprk[2], pprk[3])
 *		%xmm2 = (xxx, xxx, xxx, t = Rot(Sub(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = rk, the round key we just computed
 *		%xmm1 = (nrk[0], nrk[1], xxx, xxx), half of next round key
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *		%xmm7 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand192b,@function
aesni_expand192b:
	/*
	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
	 * i.e., set each word of %xmm2 to t := Rot(Sub(prk[3])) ^ RCON.
	 */
	pshufd	$0b11111111,%xmm2,%xmm2

	/*
	 * We need to compute:
	 *
	 * rk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2]
	 * rk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3]
	 * rk[2] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 * rk[3] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *     ^ prk[1]
	 * nrk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *     ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *     ^ prk[1] ^ prk[2] ^ prk[3]
	 */

	/* %xmm1 := (pprk[2], pprk[3], prk[0], prk[1]) */
	shufps	$0b01001110,%xmm0,%xmm1

	/*
	 * %xmm5 := (0, pprk[2], pprk[3], prk[0])
	 * %xmm6 := (0, 0, pprk[2], pprk[3])
	 * %xmm7 := (0, 0, 0, pprk[2])
	 */
	movdqa	%xmm1,%xmm5
	movdqa	%xmm1,%xmm6
	movdqa	%xmm1,%xmm7
	pslldq	$4,%xmm5
	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
	pxor	%xmm2,%xmm1
	pxor	%xmm5,%xmm1
	pxor	%xmm6,%xmm1
	pxor	%xmm7,%xmm1

	/* %xmm4 := (prk[2], prk[3], xxx, xxx) */
	pshufd	$0b00001110,%xmm0,%xmm4

	/* %xmm5 := (0, prk[2], xxx, xxx) */
	movdqa	%xmm4,%xmm5
	pslldq	$4,%xmm5

	/* %xmm0 := (rk[0], rk[1], rk[2], rk[3]) */
	movdqa	%xmm1,%xmm0

	/* %xmm1 := (rk[3], rk[3], xxx, xxx) */
	shufps	$0b00001111,%xmm1,%xmm1

	/*
	 * %xmm1 := (nrk[0] = rk[3] ^ prk[2],
	 *     nrk[1] = rk[3] ^ prk[2] ^ prk[3],
	 *     xxx,
	 *     xxx)
	 */
	pxor	%xmm4,%xmm1
	pxor	%xmm5,%xmm1

	movdqa	%xmm0,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand192b)
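
/*
 * For reference, a C sketch of the AES-192 schedule that the
 * expand192a/expand192b pair implements, with the round keys as a
 * flat word array w[] and hypothetical subw()/rotw() standing in for
 * AESKEYGENASSIST:
 *
 *	for (i = 6; i < 52; i++) {
 *		t = w[i - 1];
 *		if (i % 6 == 0)
 *			t = subw(rotw(t)) ^ rcon[i/6 - 1];
 *		w[i] = w[i - 6] ^ t;
 *	}
 *
 * The key words advance six at a time but the round keys are read
 * four at a time, which is why the two helpers alternate and shuffle
 * halves between %xmm0 and %xmm1.
 */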

/*
 * aesni_expand256a(uint128_t *rkp@rdi, uint128_t pprk@xmm0,
 *     uint128_t prk@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set even-numbered AES-256 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (pprk[0], pprk[1], pprk[2], pprk[3])
 *		%xmm1 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])))
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = rk, the round key we just computed
 *		%xmm1 = prk, previous round key, preserved from entry
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *
 *	The computation turns out to be the same as for AES-128; the
 *	previous round key does not figure into it, only the
 *	previous-previous round key.
 */
	aesni_expand256a = aesni_expand128

/*
 * aesni_expand256b(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *     uint128_t pprk@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set odd-numbered AES-256 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm1 = (pprk[0], pprk[1], pprk[2], pprk[3])
 *		%xmm2 = (xxx, xxx, t = Sub(prk[3]), xxx)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = prk, previous round key, preserved from entry
 *		%xmm1 = rk, the round key we just computed
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand256b,@function
aesni_expand256b:
	/*
	 * %xmm2 := (%xmm2[2], %xmm2[2], %xmm2[2], %xmm2[2]),
	 * i.e., set each word of %xmm2 to t := Sub(prk[3]).
	 */
	pshufd	$0b10101010,%xmm2,%xmm2

	/*
	 * %xmm4 := (0, pprk[0], pprk[1], pprk[2])
	 * %xmm5 := (0, 0, pprk[0], pprk[1])
	 * %xmm6 := (0, 0, 0, pprk[0])
	 */
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm5
	movdqa	%xmm1,%xmm6
	pslldq	$4,%xmm4
	pslldq	$8,%xmm5
	pslldq	$12,%xmm6

	/*
	 * %xmm1 := (rk[0] = t ^ pprk[0],
	 *     rk[1] = t ^ pprk[0] ^ pprk[1],
	 *     rk[2] = t ^ pprk[0] ^ pprk[1] ^ pprk[2],
	 *     rk[3] = t ^ pprk[0] ^ pprk[1] ^ pprk[2] ^ pprk[3])
	 */
	pxor	%xmm2,%xmm1
	pxor	%xmm4,%xmm1
	pxor	%xmm5,%xmm1
	pxor	%xmm6,%xmm1

	movdqa	%xmm1,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand256b)
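
/*
 * For reference, a C sketch of the AES-256 schedule implemented by
 * the expand256a/expand256b pair, with the round keys as a flat word
 * array w[] and hypothetical subw()/rotw() standing in for
 * AESKEYGENASSIST:
 *
 *	for (i = 8; i < 60; i++) {
 *		t = w[i - 1];
 *		if (i % 8 == 0)
 *			t = subw(rotw(t)) ^ rcon[i/8 - 1];
 *		else if (i % 8 == 4)
 *			t = subw(t);	// expand256b: Sub only, no RCON
 *		w[i] = w[i - 8] ^ t;
 *	}
 */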

/*
 * aesni_enctodec(const struct aesenc *enckey@rdi, struct aesdec *deckey@rsi,
 *     uint32_t nrounds@rdx)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_enctodec)
	shl	$4,%edx		/* rdx := byte offset of last round key */
	movdqa	(%rdi,%rdx),%xmm0	/* load last round key */
	movdqa	%xmm0,(%rsi)	/* store last round key verbatim */
	jmp	2f
1:	movdqa	(%rdi,%rdx),%xmm0	/* load round key */
	aesimc	%xmm0,%xmm0	/* convert encryption to decryption */
	movdqa	%xmm0,(%rsi)	/* store round key */
2:	sub	$0x10,%rdx	/* advance to next round key */
	lea	0x10(%rsi),%rsi
	jnz	1b		/* repeat if more rounds */
	movdqa	(%rdi),%xmm0	/* load first round key */
	movdqa	%xmm0,(%rsi)	/* store first round key verbatim */
	ret
END(aesni_enctodec)
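
/*
 * In C terms, this computes the "equivalent inverse cipher" key
 * schedule: with nr = nrounds,
 *
 *	dec[0] = enc[nr];
 *	for (i = 1; i < nr; i++)
 *		dec[i] = InvMixColumns(enc[nr - i]);
 *	dec[nr] = enc[0];
 *
 * where InvMixColumns is the transformation AESIMC applies to a
 * 128-bit round key, so that AESDEC can consume dec[] in order.
 */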

/*
 * aesni_enc(const struct aesenc *enckey@rdi, const uint8_t in[16] @rsi,
 *     uint8_t out[16] @rdx, uint32_t nrounds@ecx)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_enc)
	movdqu	(%rsi),%xmm0
	call	aesni_enc1
	movdqu	%xmm0,(%rdx)
	ret
END(aesni_enc)

/*
 * aesni_dec(const struct aesdec *deckey@rdi, const uint8_t in[16] @rsi,
 *     uint8_t out[16] @rdx, uint32_t nrounds@ecx)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_dec)
	movdqu	(%rsi),%xmm0
	call	aesni_dec1
	movdqu	%xmm0,(%rdx)
	ret
END(aesni_dec)

/*
 * aesni_cbc_enc(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t iv[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_enc)
	cmp	$0,%rcx
	jz	2f
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm0		/* xmm0 := chaining value */
1:	movdqu	(%rsi),%xmm1		/* xmm1 := plaintext block */
	lea	0x10(%rsi),%rsi
	pxor	%xmm1,%xmm0		/* xmm0 := cv ^ ptxt */
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_enc1		/* xmm0 := ciphertext block */
	movdqu	%xmm0,(%rdx)
	lea	0x10(%rdx),%rdx
	sub	$0x10,%r10
	jnz	1b			/* repeat if r10 is nonzero */
	movdqu	%xmm0,(%r8)		/* store chaining value */
2:	ret
END(aesni_cbc_enc)
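
/*
 * In C terms, with E the AES encryption of one block and cv the
 * chaining value (initially the IV):
 *
 *	for (i = 0; i < nbytes/16; i++)
 *		cv = c[i] = E(key, p[i] ^ cv);
 *
 * Each block's input depends on the previous block's output, so CBC
 * encryption is inherently serial; that is why there is no 8-block
 * variant of this routine.
 */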

/*
 * aesni_cbc_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
 *     uint32_t nrounds@r9)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesni_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_dec1)
	push	%rbp			/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	movdqu	(%r8),%xmm8		/* xmm8 := iv */
	movdqa	%xmm8,(%rsp)		/* save iv */
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	-0x10(%rsi,%r10),%xmm0	/* xmm0 := last ciphertext block */
	movdqu	%xmm0,(%r8)		/* update iv */
	jmp	2f
1:	movdqu	-0x10(%rsi,%r10),%xmm8	/* xmm8 := chaining value */
	pxor	%xmm8,%xmm0		/* xmm0 := ptxt */
	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
	movdqa	%xmm8,%xmm0		/* move cv = ciphertext block */
2:	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_dec1		/* xmm0 := cv ^ ptxt */
	sub	$0x10,%r10
	jnz	1b			/* repeat if more blocks */
	pxor	(%rsp),%xmm0		/* xmm0 := ptxt */
	movdqu	%xmm0,(%rdx)		/* store first plaintext block */
	leave
	ret
END(aesni_cbc_dec1)
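
/*
 * In C terms, CBC decryption is
 *
 *	p[i] = D(key, c[i]) ^ c[i-1],	with c[-1] = iv,
 *
 * and each plaintext block depends on only two ciphertext blocks, so
 * any processing order works.  Running back to front means c[i-1] is
 * still intact when p[i] is stored, so in-place operation (out == in)
 * needs no extra copies, and the caller's IV can be updated up front.
 */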

/*
 * aesni_cbc_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
 *     uint32_t nrounds@r9)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_dec8)
	push	%rbp			/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	movdqu	(%r8),%xmm8		/* xmm8 := iv */
	movdqa	%xmm8,(%rsp)		/* save iv */
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := ciphertext block[n-1] */
	movdqu	%xmm7,(%r8)		/* update iv */
	jmp	2f
1:	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := cv[0] */
	pxor	%xmm7,%xmm0		/* xmm0 := ptxt[0] */
	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
2:	movdqu	-0x20(%rsi,%r10),%xmm6	/* xmm6 := ciphertext block[n-2] */
	movdqu	-0x30(%rsi,%r10),%xmm5	/* xmm5 := ciphertext block[n-3] */
	movdqu	-0x40(%rsi,%r10),%xmm4	/* xmm4 := ciphertext block[n-4] */
	movdqu	-0x50(%rsi,%r10),%xmm3	/* xmm3 := ciphertext block[n-5] */
	movdqu	-0x60(%rsi,%r10),%xmm2	/* xmm2 := ciphertext block[n-6] */
	movdqu	-0x70(%rsi,%r10),%xmm1	/* xmm1 := ciphertext block[n-7] */
	movdqu	-0x80(%rsi,%r10),%xmm0	/* xmm0 := ciphertext block[n-8] */
	movdqa	%xmm6,%xmm15		/* xmm[8+i] := cv[i], 0<i<8 */
	movdqa	%xmm5,%xmm14
	movdqa	%xmm4,%xmm13
	movdqa	%xmm3,%xmm12
	movdqa	%xmm2,%xmm11
	movdqa	%xmm1,%xmm10
	movdqa	%xmm0,%xmm9
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_dec8		/* xmm[i] := cv[i] ^ ptxt[i], 0<=i<8 */
	pxor	%xmm15,%xmm7		/* xmm[i] := ptxt[i], 0<i<8 */
	pxor	%xmm14,%xmm6
	pxor	%xmm13,%xmm5
	pxor	%xmm12,%xmm4
	pxor	%xmm11,%xmm3
	pxor	%xmm10,%xmm2
	pxor	%xmm9,%xmm1
	movdqu	%xmm7,-0x10(%rdx,%r10)	/* store plaintext blocks */
	movdqu	%xmm6,-0x20(%rdx,%r10)
	movdqu	%xmm5,-0x30(%rdx,%r10)
	movdqu	%xmm4,-0x40(%rdx,%r10)
	movdqu	%xmm3,-0x50(%rdx,%r10)
	movdqu	%xmm2,-0x60(%rdx,%r10)
	movdqu	%xmm1,-0x70(%rdx,%r10)
	sub	$0x80,%r10
	jnz	1b			/* repeat if more blocks */
	pxor	(%rsp),%xmm0		/* xmm0 := ptxt[0] */
	movdqu	%xmm0,(%rdx)		/* store first plaintext block */
	leave
	ret
END(aesni_cbc_dec8)

/*
 * aesni_xts_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesni_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_enc1)
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm15		/* xmm15 := tweak */
1:	movdqu	(%rsi),%xmm0		/* xmm0 := ptxt */
	lea	0x10(%rsi),%rsi		/* advance rsi to next block */
	pxor	%xmm15,%xmm0		/* xmm0 := ptxt ^ tweak */
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_enc1		/* xmm0 := AES(ptxt ^ tweak) */
	pxor	%xmm15,%xmm0		/* xmm0 := AES(ptxt ^ tweak) ^ tweak */
	movdqu	%xmm0,(%rdx)		/* store ciphertext block */
	lea	0x10(%rdx),%rdx		/* advance rdx to next block */
	call	aesni_xts_mulx		/* xmm15 *= x; trash xmm0 */
	sub	$0x10,%r10
	jnz	1b			/* repeat if more blocks */
	movdqu	%xmm15,(%r8)		/* update tweak */
	ret
END(aesni_xts_enc1)
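
/*
 * In C terms, with E the AES encryption of one block and T the
 * 128-bit tweak:
 *
 *	for (i = 0; i < nbytes/16; i++) {
 *		c[i] = E(key, p[i] ^ T) ^ T;
 *		T = xts_mulx(T);	// T *= x in GF(2^128)
 *	}
 */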

/*
 * aesni_xts_enc8(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_enc8)
	push	%rbp			/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm15		/* xmm15 := tweak[0] */
1:	movdqa	%xmm15,%xmm8		/* xmm8 := tweak[0] */
	call	aesni_xts_mulx		/* xmm15 := tweak[1] */
	movdqa	%xmm15,%xmm9		/* xmm9 := tweak[1] */
	call	aesni_xts_mulx		/* xmm15 := tweak[2] */
	movdqa	%xmm15,%xmm10		/* xmm10 := tweak[2] */
	call	aesni_xts_mulx		/* xmm15 := tweak[3] */
	movdqa	%xmm15,%xmm11		/* xmm11 := tweak[3] */
	call	aesni_xts_mulx		/* xmm15 := tweak[4] */
	movdqa	%xmm15,%xmm12		/* xmm12 := tweak[4] */
	call	aesni_xts_mulx		/* xmm15 := tweak[5] */
	movdqa	%xmm15,%xmm13		/* xmm13 := tweak[5] */
	call	aesni_xts_mulx		/* xmm15 := tweak[6] */
	movdqa	%xmm15,%xmm14		/* xmm14 := tweak[6] */
	call	aesni_xts_mulx		/* xmm15 := tweak[7] */
	movdqu	(%rsi),%xmm0		/* xmm[i] := ptxt[i] */
	movdqu	0x10(%rsi),%xmm1
	movdqu	0x20(%rsi),%xmm2
	movdqu	0x30(%rsi),%xmm3
	movdqu	0x40(%rsi),%xmm4
	movdqu	0x50(%rsi),%xmm5
	movdqu	0x60(%rsi),%xmm6
	movdqu	0x70(%rsi),%xmm7
	lea	0x80(%rsi),%rsi		/* advance rsi to next block group */
	movdqa	%xmm8,(%rsp)		/* save tweak[0] */
	pxor	%xmm8,%xmm0		/* xmm[i] := ptxt[i] ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_enc8		/* xmm[i] := AES(ptxt[i] ^ tweak[i]) */
	pxor	(%rsp),%xmm0		/* xmm[i] := AES(...) ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	movdqu	%xmm0,(%rdx)		/* store ciphertext blocks */
	movdqu	%xmm1,0x10(%rdx)
	movdqu	%xmm2,0x20(%rdx)
	movdqu	%xmm3,0x30(%rdx)
	movdqu	%xmm4,0x40(%rdx)
	movdqu	%xmm5,0x50(%rdx)
	movdqu	%xmm6,0x60(%rdx)
	movdqu	%xmm7,0x70(%rdx)
	lea	0x80(%rdx),%rdx		/* advance rdx to next block group */
	call	aesni_xts_mulx		/* xmm15 := tweak[8] */
	sub	$0x80,%r10
	jnz	1b			/* repeat if more block groups */
	movdqu	%xmm15,(%r8)		/* update tweak */
	leave
	ret
END(aesni_xts_enc8)

/*
 * aesni_xts_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesni_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_dec1)
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm15		/* xmm15 := tweak */
1:	movdqu	(%rsi),%xmm0		/* xmm0 := ctxt */
	lea	0x10(%rsi),%rsi		/* advance rsi to next block */
	pxor	%xmm15,%xmm0		/* xmm0 := ctxt ^ tweak */
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_dec1		/* xmm0 := AES^-1(ctxt ^ tweak) */
	pxor	%xmm15,%xmm0		/* xmm0 := AES^-1(ctxt ^ tweak) ^ tweak */
	movdqu	%xmm0,(%rdx)		/* store plaintext block */
	lea	0x10(%rdx),%rdx		/* advance rdx to next block */
	call	aesni_xts_mulx		/* xmm15 *= x; trash xmm0 */
	sub	$0x10,%r10
	jnz	1b			/* repeat if more blocks */
	movdqu	%xmm15,(%r8)		/* update tweak */
	ret
END(aesni_xts_dec1)

/*
 * aesni_xts_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_dec8)
	push	%rbp			/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm15		/* xmm15 := tweak[0] */
1:	movdqa	%xmm15,%xmm8		/* xmm8 := tweak[0] */
	call	aesni_xts_mulx		/* xmm15 := tweak[1] */
	movdqa	%xmm15,%xmm9		/* xmm9 := tweak[1] */
	call	aesni_xts_mulx		/* xmm15 := tweak[2] */
	movdqa	%xmm15,%xmm10		/* xmm10 := tweak[2] */
	call	aesni_xts_mulx		/* xmm15 := tweak[3] */
	movdqa	%xmm15,%xmm11		/* xmm11 := tweak[3] */
	call	aesni_xts_mulx		/* xmm15 := tweak[4] */
	movdqa	%xmm15,%xmm12		/* xmm12 := tweak[4] */
	call	aesni_xts_mulx		/* xmm15 := tweak[5] */
	movdqa	%xmm15,%xmm13		/* xmm13 := tweak[5] */
	call	aesni_xts_mulx		/* xmm15 := tweak[6] */
	movdqa	%xmm15,%xmm14		/* xmm14 := tweak[6] */
	call	aesni_xts_mulx		/* xmm15 := tweak[7] */
	movdqu	(%rsi),%xmm0		/* xmm[i] := ctxt[i] */
	movdqu	0x10(%rsi),%xmm1
	movdqu	0x20(%rsi),%xmm2
	movdqu	0x30(%rsi),%xmm3
	movdqu	0x40(%rsi),%xmm4
	movdqu	0x50(%rsi),%xmm5
	movdqu	0x60(%rsi),%xmm6
	movdqu	0x70(%rsi),%xmm7
	lea	0x80(%rsi),%rsi		/* advance rsi to next block group */
	movdqa	%xmm8,(%rsp)		/* save tweak[0] */
	pxor	%xmm8,%xmm0		/* xmm[i] := ctxt[i] ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_dec8		/* xmm[i] := AES^-1(ctxt[i] ^ tweak[i]) */
	pxor	(%rsp),%xmm0		/* xmm[i] := AES^-1(...) ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	movdqu	%xmm0,(%rdx)		/* store plaintext blocks */
	movdqu	%xmm1,0x10(%rdx)
	movdqu	%xmm2,0x20(%rdx)
	movdqu	%xmm3,0x30(%rdx)
	movdqu	%xmm4,0x40(%rdx)
	movdqu	%xmm5,0x50(%rdx)
	movdqu	%xmm6,0x60(%rdx)
	movdqu	%xmm7,0x70(%rdx)
	lea	0x80(%rdx),%rdx		/* advance rdx to next block group */
	call	aesni_xts_mulx		/* xmm15 := tweak[8] */
	sub	$0x80,%r10
	jnz	1b			/* repeat if more block groups */
	movdqu	%xmm15,(%r8)		/* update tweak */
	leave
	ret
END(aesni_xts_dec8)

/*
 * aesni_xts_mulx(tweak@xmm15)
 *
 *	Multiply xmm15 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses %xmm0 as temporary.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_xts_mulx,@function
aesni_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low quadword must be
	 *     shifted into the low bit of the high quadword, and
	 * (b) whether the high bit of the high quadword must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	pxor	%xmm0,%xmm0	/* xmm0 := 0 */
	pcmpgtq	%xmm15,%xmm0	/* xmm0[i] := -1 if 0 > xmm15[i] else 0 */
	pshufd	$0b01001110,%xmm0,%xmm0	/* swap halves of xmm0 */
	pand	xtscarry(%rip),%xmm0	/* copy xtscarry according to mask */
	psllq	$1,%xmm15	/* shift */
	pxor	%xmm0,%xmm15	/* incorporate (a) and (b) */
	ret
END(aesni_xts_mulx)
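
/*
 * A C sketch of the same multiplication, with the tweak split into
 * two 64-bit halves, lo (bytes 0-7) and hi (bytes 8-15):
 *
 *	uint64_t carry_lo = (int64_t)lo >> 63;	// -1 if lo's top bit set
 *	uint64_t carry_hi = (int64_t)hi >> 63;	// -1 if hi's top bit set
 *
 *	lo = (lo << 1) ^ (carry_hi & 0x87);	// x^128 = x^7 + x^2 + x + 1
 *	hi = (hi << 1) ^ (carry_lo & 1);	// carry across the halves
 *
 * PCMPGTQ against zero yields both sign masks at once, the PSHUFD
 * swaps them onto the opposite quadwords, and the masked xtscarry
 * constant supplies the 0x87 and the 1.
 */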

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
END(xtscarry)

/*
 * aesni_xts_update(const uint8_t in[16] @rdi, uint8_t out[16] @rsi)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_update)
	movdqu	(%rdi),%xmm15
	call	aesni_xts_mulx
	movdqu	%xmm15,(%rsi)
	ret
END(aesni_xts_update)

/*
 * aesni_cbcmac_update1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     size_t nbytes@rdx, uint8_t auth[16] @rcx, uint32_t nrounds@r8d)
 *
 *	Update CBC-MAC.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbcmac_update1)
	movdqu	(%rcx),%xmm0		/* xmm0 := auth */
	mov	%rdx,%r10		/* r10 := nbytes */
	mov	%rcx,%rdx		/* rdx := &auth */
1:	pxor	(%rsi),%xmm0		/* xmm0 ^= plaintext block */
	lea	0x10(%rsi),%rsi
	mov	%r8d,%ecx		/* ecx := nrounds */
	call	aesni_enc1		/* xmm0 := auth'; trash rax,rcx,xmm8 */
	sub	$0x10,%r10
	jnz	1b
	movdqu	%xmm0,(%rdx)		/* store auth' */
	ret
END(aesni_cbcmac_update1)
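
/*
 * In C terms, this is CBC encryption keeping only the running last
 * block, with m[i] the i'th 16-byte message block:
 *
 *	for (i = 0; i < nbytes/16; i++)
 *		auth = E(key, auth ^ m[i]);
 */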

/*
 * aesni_ccm_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx,
 *     uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
 *
 *	Update CCM encryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_ccm_enc1)
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	0x10(%r8),%xmm2		/* xmm2 := ctr (be) */
	movdqa	bswap32(%rip),%xmm4	/* xmm4 := bswap32 table */
	movdqa	ctr32_inc(%rip),%xmm5	/* xmm5 := (0,0,0,1) (le) */
	movdqu	(%r8),%xmm0		/* xmm0 := auth */
	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (le) */
1:	movdqu	(%rsi),%xmm3		/* xmm3 := plaintext block */
	paddd	%xmm5,%xmm2		/* increment ctr (32-bit) */
	lea	0x10(%rsi),%rsi
	movdqa	%xmm2,%xmm1		/* xmm1 := ctr (le) */
	mov	%r9d,%ecx		/* ecx := nrounds */
	pshufb	%xmm4,%xmm1		/* xmm1 := ctr (be) */
	pxor	%xmm3,%xmm0		/* xmm0 := auth ^ ptxt */
	call	aesni_enc2		/* trash rax/rcx/xmm8 */
	pxor	%xmm1,%xmm3		/* xmm3 := ciphertext block */
	sub	$0x10,%r10		/* count down bytes */
	movdqu	%xmm3,(%rdx)		/* store ciphertext block */
	lea	0x10(%rdx),%rdx
	jnz	1b			/* repeat if more blocks */
	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (be) */
	movdqu	%xmm0,(%r8)		/* store updated auth */
	movdqu	%xmm2,0x10(%r8)		/* store updated ctr */
	ret
END(aesni_ccm_enc1)
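
/*
 * In C terms, each iteration advances the CBC-MAC and CTR halves of
 * CCM together -- aesni_enc2 runs the two AES invocations in
 * parallel:
 *
 *	for (i = 0; i < nbytes/16; i++) {
 *		ctr++;
 *		auth = E(key, auth ^ p[i]);	// CBC-MAC half
 *		c[i] = p[i] ^ E(key, ctr);	// CTR half
 *	}
 */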
   1013  1.4  riastrad 
/*
 * aesni_ccm_dec1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx,
 *     uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
 *
 *	Update CCM decryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_ccm_dec1)
	movdqu	0x10(%r8),%xmm2		/* xmm2 := ctr (be) */
	movdqa	bswap32(%rip),%xmm4	/* xmm4 := bswap32 table */
	movdqa	ctr32_inc(%rip),%xmm5	/* xmm5 := (0,0,0,1) (le) */
	movdqu	(%r8),%xmm1		/* xmm1 := auth */
	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (le) */
	mov	%rcx,%r10		/* r10 := nbytes */

	/* Decrypt the first block.  */
	paddd	%xmm5,%xmm2		/* increment ctr (32-bit) */
	mov	%r9d,%ecx		/* ecx := nrounds */
	movdqa	%xmm2,%xmm0		/* xmm0 := ctr (le) */
	movdqu	(%rsi),%xmm3		/* xmm3 := ctxt */
	pshufb	%xmm4,%xmm0		/* xmm0 := ctr (be) */
	lea	0x10(%rsi),%rsi
	call	aesni_enc1		/* xmm0 := pad; trash rax/rcx/xmm8 */
	jmp	2f

1:	/*
	 * Authenticate the last block and decrypt the next block
	 * simultaneously.
	 *
	 *	xmm1 = auth ^ ptxt[-1]
	 *	xmm2 = ctr[-1] (le)
	 */
	paddd	%xmm5,%xmm2		/* increment ctr (32-bit) */
	mov	%r9d,%ecx		/* ecx := nrounds */
	movdqa	%xmm2,%xmm0		/* xmm0 := ctr (le) */
	movdqu	(%rsi),%xmm3		/* xmm3 := ctxt */
	pshufb	%xmm4,%xmm0		/* xmm0 := ctr (be) */
	lea	0x10(%rsi),%rsi
	call	aesni_enc2		/* xmm0 := pad, xmm1 := auth';
					 * trash rax/rcx/xmm8 */
2:	pxor	%xmm0,%xmm3		/* xmm3 := ptxt */
	sub	$0x10,%r10
	movdqu	%xmm3,(%rdx)		/* store plaintext */
	lea	0x10(%rdx),%rdx
	pxor	%xmm3,%xmm1		/* xmm1 := auth ^ ptxt */
	jnz	1b			/* repeat if more blocks */

	/* Authenticate the last block.  */
	movdqa	%xmm1,%xmm0		/* xmm0 := auth ^ ptxt */
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_enc1		/* xmm0 := auth'; trash rax/rcx/xmm8 */
	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (be) */
	movdqu	%xmm0,(%r8)		/* store updated auth */
	movdqu	%xmm2,0x10(%r8)		/* store updated ctr */
	ret
END(aesni_ccm_dec1)
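
/*
 * For reference, the routine above computes, per 16-byte block,
 * pad = E(ctr+1), ptxt = ctxt ^ pad, auth = E(auth ^ ptxt) -- the
 * assembly overlaps the CBC-MAC step for block i with the CTR pad
 * for block i+1 via aesni_enc2, which the following minimal C sketch
 * serializes.  aes_enc() is a hypothetical one-block AES helper, not
 * an API of this file.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void
 *	ccm_dec1_model(const struct aesenc *enckey, const uint8_t *in,
 *	    uint8_t *out, size_t nbytes, uint8_t authctr[32],
 *	    uint32_t nrounds)
 *	{
 *		uint8_t auth[16], ctr[16], pad[16];
 *		unsigned i;
 *
 *		memcpy(auth, authctr, 16);
 *		memcpy(ctr, authctr + 16, 16);
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (i = 16; i-- > 12;)	// 32-bit be increment
 *				if (++ctr[i] != 0)
 *					break;
 *			aes_enc(enckey, ctr, pad, nrounds);
 *			for (i = 0; i < 16; i++) {
 *				out[i] = in[i] ^ pad[i];	// CTR decrypt
 *				auth[i] ^= out[i];		// absorb ptxt
 *			}
 *			aes_enc(enckey, auth, auth, nrounds);	// CBC-MAC
 *		}
 *		memcpy(authctr, auth, 16);
 *		memcpy(authctr + 16, ctr, 16);
 *	}
 */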

	.section .rodata
	.p2align 4
	.type	bswap32,@object
bswap32:
	.byte	3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
END(bswap32)

	.section .rodata
	.p2align 4
	.type	ctr32_inc,@object
ctr32_inc:
	.byte	0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0
END(ctr32_inc)
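
/*
 * The two tables above implement a 32-bit big-endian counter
 * increment entirely in xmm registers: PSHUFB with the bswap32 table
 * byte-reverses each 32-bit lane, PADDD with ctr32_inc adds 1 to the
 * lane holding the counter word (bytes 12-15 of the block), and a
 * second PSHUFB restores big-endian order before the counter is
 * encrypted or stored.  A minimal C equivalent, using be32dec and
 * be32enc from NetBSD's <sys/endian.h>:
 *
 *	#include <sys/endian.h>
 *	#include <stdint.h>
 *
 *	static inline void
 *	ctr32_inc_model(uint8_t ctr[16])
 *	{
 *		be32enc(&ctr[12], be32dec(&ctr[12]) + 1);
 *	}
 */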

/*
 * aesni_enc1(const struct aesenc *enckey@rdi, uint128_t block@xmm0,
 *     uint32_t nrounds@ecx)
 *
 *	Encrypt a single AES block in %xmm0.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc1,@function
aesni_enc1:
	pxor	(%rdi),%xmm0	/* xor in first round key */
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
1:	aesenc	%xmm8,%xmm0
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesenclast %xmm8,%xmm0
	ret
END(aesni_enc1)
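
/*
 * The loop above indexes the round keys from the end of the array
 * with a negative byte offset, so the add of 0x10 both advances the
 * index and sets ZF when the last round key has been loaded -- no
 * separate comparison is needed.  A C sketch of the computation,
 * where aesenc_round() and aesenclast_round() are hypothetical
 * stand-ins for the AESENC and AESENCLAST instructions:
 *
 *	// b: one 16-byte block; rk: array of (nrounds + 1) round keys
 *	static void
 *	enc1_model(const uint8_t *rk, uint8_t b[16], uint32_t nrounds)
 *	{
 *		uint32_t i;
 *
 *		for (i = 0; i < 16; i++)
 *			b[i] ^= rk[i];			// whitening key
 *		for (i = 1; i < nrounds; i++)
 *			aesenc_round(b, &rk[16*i]);	// full round
 *		aesenclast_round(b, &rk[16*nrounds]);	// no MixColumns
 *	}
 */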

/*
 * aesni_enc2(const struct aesenc *enckey@rdi, uint128_t block0@xmm0,
 *     uint128_t block1@xmm1, uint32_t nrounds@ecx)
 *
 *	Encrypt two AES blocks in %xmm0 and %xmm1.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc2,@function
aesni_enc2:
	movdqa	(%rdi),%xmm8	/* xmm8 := first round key */
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	pxor	%xmm8,%xmm0	/* xor in first round key */
	pxor	%xmm8,%xmm1
	jmp	2f
1:	aesenc	%xmm8,%xmm0
	aesenc	%xmm8,%xmm1
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesenclast %xmm8,%xmm0
	aesenclast %xmm8,%xmm1
	ret
END(aesni_enc2)
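
/*
 * Two interleaved blocks serve the CCM loop above: the CBC-MAC step
 * for block i and the CTR pad for block i+1 are independent, so
 * alternating the two AESENC chains lets them overlap in the
 * pipeline, costing little more than a single aesni_enc1 call where
 * two back-to-back calls would pay the full round latency twice.
 */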

/*
 * aesni_enc8(const struct aesenc *enckey@rdi, uint128_t block0@xmm0, ...,
 *     block7@xmm7, uint32_t nrounds@ecx)
 *
 *	Encrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc8,@function
aesni_enc8:
	movdqa	(%rdi),%xmm8	/* xmm8 := first round key */
	pxor	%xmm8,%xmm0	/* xor in first round key */
	pxor	%xmm8,%xmm1
	pxor	%xmm8,%xmm2
	pxor	%xmm8,%xmm3
	pxor	%xmm8,%xmm4
	pxor	%xmm8,%xmm5
	pxor	%xmm8,%xmm6
	pxor	%xmm8,%xmm7
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
1:	aesenc	%xmm8,%xmm0
	aesenc	%xmm8,%xmm1
	aesenc	%xmm8,%xmm2
	aesenc	%xmm8,%xmm3
	aesenc	%xmm8,%xmm4
	aesenc	%xmm8,%xmm5
	aesenc	%xmm8,%xmm6
	aesenc	%xmm8,%xmm7
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesenclast %xmm8,%xmm0
	aesenclast %xmm8,%xmm1
	aesenclast %xmm8,%xmm2
	aesenclast %xmm8,%xmm3
	aesenclast %xmm8,%xmm4
	aesenclast %xmm8,%xmm5
	aesenclast %xmm8,%xmm6
	aesenclast %xmm8,%xmm7
	ret
END(aesni_enc8)
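
/*
 * Eight-way interleaving serves modes that expose many independent
 * blocks at once (CTR, for instance): AESENC typically has a latency
 * of several cycles but pipelined throughput near one instruction
 * per cycle, so eight independent dependency chains keep the AES
 * unit busy instead of stalling on each block's previous round.
 */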

/*
 * aesni_dec1(const struct aesdec *deckey@rdi, uint128_t block@xmm0,
 *     uint32_t nrounds@ecx)
 *
 *	Decrypt a single AES block in %xmm0.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_dec1,@function
aesni_dec1:
	pxor	(%rdi),%xmm0	/* xor in first round key */
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
1:	aesdec	%xmm8,%xmm0
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesdeclast %xmm8,%xmm0
	ret
END(aesni_dec1)
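
/*
 * Note that AESDEC implements the equivalent inverse cipher, so the
 * round keys in struct aesdec must be the encryption round keys in
 * reverse order, with InvMixColumns (the AESIMC instruction) applied
 * to all but the first and last; plain encryption round keys will
 * not decrypt correctly here.
 */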

/*
 * aesni_dec8(const struct aesdec *deckey@rdi, uint128_t block0@xmm0, ...,
 *     block7@xmm7, uint32_t nrounds@ecx)
 *
 *	Decrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_dec8,@function
aesni_dec8:
	movdqa	(%rdi),%xmm8	/* xmm8 := first round key */
	pxor	%xmm8,%xmm0	/* xor in first round key */
	pxor	%xmm8,%xmm1
	pxor	%xmm8,%xmm2
	pxor	%xmm8,%xmm3
	pxor	%xmm8,%xmm4
	pxor	%xmm8,%xmm5
	pxor	%xmm8,%xmm6
	pxor	%xmm8,%xmm7
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
1:	aesdec	%xmm8,%xmm0
	aesdec	%xmm8,%xmm1
	aesdec	%xmm8,%xmm2
	aesdec	%xmm8,%xmm3
	aesdec	%xmm8,%xmm4
	aesdec	%xmm8,%xmm5
	aesdec	%xmm8,%xmm6
	aesdec	%xmm8,%xmm7
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesdeclast %xmm8,%xmm0
	aesdeclast %xmm8,%xmm1
	aesdeclast %xmm8,%xmm2
	aesdeclast %xmm8,%xmm3
	aesdeclast %xmm8,%xmm4
	aesdeclast %xmm8,%xmm5
	aesdeclast %xmm8,%xmm6
	aesdeclast %xmm8,%xmm7
	ret
END(aesni_dec8)