      1 /*	$NetBSD: aes_ni_64.S,v 1.6 2020/07/27 20:57:23 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  * POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 #include <machine/asm.h>
     30 
     31 RCSID("$NetBSD: aes_ni_64.S,v 1.6 2020/07/27 20:57:23 riastradh Exp $")
     32 
     33 /*
     34  * MOVDQA/MOVDQU are Move Double Quadword (Aligned/Unaligned), defined
     35  * to operate on integers; MOVAPS/MOVUPS are Move (Aligned/Unaligned)
     36  * Packed Single, defined to operate on binary32 floats.  They have
     37  * exactly the same architectural effects (move a 128-bit quantity from
     38  * memory into an xmm register).
     39  *
     40  * In principle, they might have different microarchitectural effects
     41  * so that MOVAPS/MOVUPS might incur a penalty when the register is
     42  * later used for integer paths, but in practice they don't.  So we use
     43  * the one whose instruction encoding is shorter -- MOVAPS/MOVUPS.
     44  */
     45 #define	movdqa	movaps
     46 #define	movdqu	movups
     47 
     48 /*
     49  * aesni_setenckey128(struct aesenc *enckey@rdi, const uint8_t key[16] @rsi)
     50  *
      51  *	Expand a 16-byte AES-128 key into the 11 round keys for 10 rounds.
     52  *
     53  *	Standard ABI calling convention.
     54  */
     55 ENTRY(aesni_setenckey128)
     56 	movdqu	(%rsi),%xmm0	/* load master key into %xmm0 */
     57 	movdqa	%xmm0,(%rdi)	/* store master key as the first round key */
     58 	lea	0x10(%rdi),%rdi	/* advance %rdi to next round key */
     59 	aeskeygenassist $0x1,%xmm0,%xmm2
     60 	call	aesni_expand128
     61 	aeskeygenassist $0x2,%xmm0,%xmm2
     62 	call	aesni_expand128
     63 	aeskeygenassist $0x4,%xmm0,%xmm2
     64 	call	aesni_expand128
     65 	aeskeygenassist $0x8,%xmm0,%xmm2
     66 	call	aesni_expand128
     67 	aeskeygenassist $0x10,%xmm0,%xmm2
     68 	call	aesni_expand128
     69 	aeskeygenassist $0x20,%xmm0,%xmm2
     70 	call	aesni_expand128
     71 	aeskeygenassist $0x40,%xmm0,%xmm2
     72 	call	aesni_expand128
     73 	aeskeygenassist $0x80,%xmm0,%xmm2
     74 	call	aesni_expand128
     75 	aeskeygenassist $0x1b,%xmm0,%xmm2
     76 	call	aesni_expand128
     77 	aeskeygenassist $0x36,%xmm0,%xmm2
     78 	call	aesni_expand128
     79 	ret
     80 END(aesni_setenckey128)
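
/*
 * For reference, a rough C sketch of the schedule that the ten
 * aeskeygenassist/aesni_expand128 pairs above compute (not part of
 * the build; subword() and rotword() stand in for the FIPS-197
 * SubWord and RotWord operations):
 *
 *	static const uint8_t rcon[10] = {
 *		0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36
 *	};
 *
 *	void
 *	setenckey128_ref(uint32_t rk[44], const uint32_t key[4])
 *	{
 *		unsigned i;
 *
 *		memcpy(rk, key, 16);
 *		for (i = 4; i < 44; i++) {
 *			uint32_t t = rk[i - 1];
 *			if (i % 4 == 0)
 *				t = subword(rotword(t)) ^ rcon[i/4 - 1];
 *			rk[i] = rk[i - 4] ^ t;
 *		}
 *	}
 */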
     81 
     82 /*
     83  * aesni_setenckey192(struct aesenc *enckey@rdi, const uint8_t key[24] @rsi)
     84  *
      85  *	Expand a 24-byte AES-192 key into the 13 round keys for 12 rounds.
     86  *
     87  *	Standard ABI calling convention.
     88  */
     89 ENTRY(aesni_setenckey192)
     90 	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
     91 	movq	0x10(%rsi),%xmm1 /* load master key [128:192) into %xmm1 */
     92 	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
     93 	lea	0x10(%rdi),%rdi /* advance %rdi to next round key */
     94 	aeskeygenassist $0x1,%xmm1,%xmm2
     95 	call	aesni_expand192a
     96 	aeskeygenassist $0x2,%xmm0,%xmm2
     97 	call	aesni_expand192b
     98 	aeskeygenassist $0x4,%xmm1,%xmm2
     99 	call	aesni_expand192a
    100 	aeskeygenassist $0x8,%xmm0,%xmm2
    101 	call	aesni_expand192b
    102 	aeskeygenassist $0x10,%xmm1,%xmm2
    103 	call	aesni_expand192a
    104 	aeskeygenassist $0x20,%xmm0,%xmm2
    105 	call	aesni_expand192b
    106 	aeskeygenassist $0x40,%xmm1,%xmm2
    107 	call	aesni_expand192a
    108 	aeskeygenassist $0x80,%xmm0,%xmm2
    109 	call	aesni_expand192b
    110 	ret
    111 END(aesni_setenckey192)
    112 
    113 /*
    114  * aesni_setenckey256(struct aesenc *enckey@rdi, const uint8_t key[32] @rsi)
    115  *
     116  *	Expand a 32-byte AES-256 key into the 15 round keys for 14 rounds.
    117  *
    118  *	Standard ABI calling convention.
    119  */
    120 ENTRY(aesni_setenckey256)
    121 	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
    122 	movdqu	0x10(%rsi),%xmm1 /* load master key [128:256) into %xmm1 */
    123 	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
    124 	movdqa	%xmm1,0x10(%rdi) /* store master key [128:256) as round key */
    125 	lea	0x20(%rdi),%rdi	/* advance %rdi to next round key */
    126 	aeskeygenassist $0x1,%xmm1,%xmm2
    127 	call	aesni_expand256a
    128 	aeskeygenassist $0x1,%xmm0,%xmm2
    129 	call	aesni_expand256b
    130 	aeskeygenassist $0x2,%xmm1,%xmm2
    131 	call	aesni_expand256a
    132 	aeskeygenassist $0x2,%xmm0,%xmm2
    133 	call	aesni_expand256b
    134 	aeskeygenassist $0x4,%xmm1,%xmm2
    135 	call	aesni_expand256a
    136 	aeskeygenassist $0x4,%xmm0,%xmm2
    137 	call	aesni_expand256b
    138 	aeskeygenassist $0x8,%xmm1,%xmm2
    139 	call	aesni_expand256a
    140 	aeskeygenassist $0x8,%xmm0,%xmm2
    141 	call	aesni_expand256b
    142 	aeskeygenassist $0x10,%xmm1,%xmm2
    143 	call	aesni_expand256a
    144 	aeskeygenassist $0x10,%xmm0,%xmm2
    145 	call	aesni_expand256b
    146 	aeskeygenassist $0x20,%xmm1,%xmm2
    147 	call	aesni_expand256a
    148 	aeskeygenassist $0x20,%xmm0,%xmm2
    149 	call	aesni_expand256b
    150 	aeskeygenassist $0x40,%xmm1,%xmm2
    151 	call	aesni_expand256a
    152 	ret
    153 END(aesni_setenckey256)
    154 
    155 /*
    156  * aesni_expand128(uint128_t *rkp@rdi, uint128_t prk@xmm0,
    157  *     uint128_t keygenassist@xmm2)
    158  *
    159  *	1. Compute the AES-128 round key using the previous round key.
    160  *	2. Store it at *rkp.
    161  *	3. Set %xmm0 to it.
    162  *	4. Advance %rdi to point at the next round key.
    163  *
    164  *	Internal ABI.  On entry:
    165  *
    166  *		%rdi = rkp, pointer to round key to compute
    167  *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
    168  *		%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
    169  *
    170  *	On exit:
    171  *
    172  *		%rdi = &rkp[1], rkp advanced by one round key
    173  *		%xmm0 = rk, the round key we just computed
    174  *		%xmm2 = garbage
    175  *		%xmm4 = garbage
    176  *		%xmm5 = garbage
    177  *		%xmm6 = garbage
    178  *
    179  *	Note: %xmm1 is preserved (as are %xmm3 and %xmm7 through %xmm15,
    180  *	and all other registers).
    181  */
    182 	.text
    183 	_ALIGN_TEXT
    184 	.type	aesni_expand128,@function
    185 aesni_expand128:
    186 	/*
    187 	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
    188 	 * i.e., set each word of %xmm2 to t := Rot(SubWord(prk[3])) ^ RCON.
    189 	 */
    190 	pshufd	$0b11111111,%xmm2,%xmm2
    191 
    192 	/*
    193 	 * %xmm4 := (0, prk[0], prk[1], prk[2])
    194 	 * %xmm5 := (0, 0, prk[0], prk[1])
    195 	 * %xmm6 := (0, 0, 0, prk[0])
    196 	 */
    197 	movdqa	%xmm0,%xmm4
    198 	movdqa	%xmm0,%xmm5
    199 	movdqa	%xmm0,%xmm6
    200 	pslldq	$4,%xmm4
    201 	pslldq	$8,%xmm5
    202 	pslldq	$12,%xmm6
    203 
    204 	/*
    205 	 * %xmm0 := (rk[0] = t ^ prk[0],
    206 	 *     rk[1] = t ^ prk[0] ^ prk[1],
    207 	 *     rk[2] = t ^ prk[0] ^ prk[1] ^ prk[2],
    208 	 *     rk[3] = t ^ prk[0] ^ prk[1] ^ prk[2] ^ prk[3])
    209 	 */
    210 	pxor	%xmm2,%xmm0
    211 	pxor	%xmm4,%xmm0
    212 	pxor	%xmm5,%xmm0
    213 	pxor	%xmm6,%xmm0
    214 
    215 	movdqa	%xmm0,(%rdi)	/* store round key */
    216 	lea	0x10(%rdi),%rdi	/* advance to next round key address */
    217 	ret
    218 END(aesni_expand128)
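
/*
 * A sketch of the same computation with intrinsics, for readers more
 * used to C (not part of the build; t4 is the aeskeygenassist output
 * already broadcast to all four lanes, as the pshufd above does):
 *
 *	__m128i rk = _mm_xor_si128(prk, t4);
 *	rk = _mm_xor_si128(rk, _mm_slli_si128(prk, 4));
 *	rk = _mm_xor_si128(rk, _mm_slli_si128(prk, 8));
 *	rk = _mm_xor_si128(rk, _mm_slli_si128(prk, 12));
 *
 * i.e., rk[i] = t ^ prk[0] ^ ... ^ prk[i], which is the FIPS-197
 * recurrence w[i] = w[i-4] ^ w[i-1] with the dependency chain
 * unrolled into a prefix XOR.
 */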
    219 
    220 /*
    221  * aesni_expand192a(uint128_t *rkp@rdi, uint128_t prk@xmm0,
    222  *     uint64_t rklo@xmm1, uint128_t keygenassist@xmm2)
    223  *
    224  *	Set even-numbered AES-192 round key.
    225  *
    226  *	Internal ABI.  On entry:
    227  *
    228  *		%rdi = rkp, pointer to two round keys to compute
    229  *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
    230  *		%xmm1 = (rklo[0], rklo[1], xxx, xxx)
    231  *		%xmm2 = (xxx, t = Rot(SubWord(rklo[1])) ^ RCON, xxx, xxx)
    232  *
    233  *	On exit:
    234  *
    235  *		%rdi = &rkp[2], rkp advanced by two round keys
    236  *		%xmm0 = nrk, second round key we just computed
    237  *		%xmm1 = rk, first round key we just computed
    238  *		%xmm2 = garbage
    239  *		%xmm4 = garbage
    240  *		%xmm5 = garbage
    241  *		%xmm6 = garbage
    242  *		%xmm7 = garbage
    243  */
    244 	.text
    245 	_ALIGN_TEXT
    246 	.type	aesni_expand192a,@function
    247 aesni_expand192a:
    248 	/*
    249 	 * %xmm2 := (%xmm2[1], %xmm2[1], %xmm2[1], %xmm2[1]),
    250 	 * i.e., set each word of %xmm2 to t := Rot(SubWord(rklo[1])) ^ RCON.
    251 	 */
    252 	pshufd	$0b01010101,%xmm2,%xmm2
    253 
    254 	/*
    255 	 * We need to compute:
    256 	 *
    257 	 * rk[0] := rklo[0]
    258 	 * rk[1] := rklo[1]
    259 	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
    260 	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
    261 	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
    262 	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
    263 	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
    264 	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
    265 	 *     ^ rklo[1]
    266 	 */
    267 
    268 	/*
    269 	 * %xmm4 := (prk[0], prk[1], prk[2], prk[3])
    270 	 * %xmm5 := (0, prk[0], prk[1], prk[2])
    271 	 * %xmm6 := (0, 0, prk[0], prk[1])
    272 	 * %xmm7 := (0, 0, 0, prk[0])
    273 	 */
    274 	movdqa	%xmm0,%xmm4
    275 	movdqa	%xmm0,%xmm5
    276 	movdqa	%xmm0,%xmm6
    277 	movdqa	%xmm0,%xmm7
    278 	pslldq	$4,%xmm5
    279 	pslldq	$8,%xmm6
    280 	pslldq	$12,%xmm7
    281 
    282 	/* %xmm4 := (rk[2], rk[3], nrk[0], nrk[1]) */
    283 	pxor	%xmm2,%xmm4
    284 	pxor	%xmm5,%xmm4
    285 	pxor	%xmm6,%xmm4
    286 	pxor	%xmm7,%xmm4
    287 
    288 	/*
    289 	 * At this point, rk is split across %xmm1 (rk[0],rk[1],...) and
    290 	 * %xmm4 (rk[2],rk[3],...); nrk is in %xmm4 (...,nrk[0],nrk[1]);
    291 	 * and we have yet to compute nrk[2] or nrk[3], which requires
    292 	 * rklo[0] and rklo[1] in %xmm1 (rklo[0], rklo[1], ...).  We need
    293 	 * nrk to end up in %xmm0 at the end, so gather rk into %xmm1 and
    294 	 * nrk into %xmm0.
    295 	 */
    296 
    297 	/* %xmm0 := (nrk[0], nrk[1], nrk[1], nrk[1]) */
    298 	pshufd	$0b11111110,%xmm4,%xmm0
    299 
    300 	/*
    301 	 * %xmm6 := (0, 0, rklo[0], rklo[1])
    302 	 * %xmm7 := (0, 0, 0, rklo[0])
    303 	 */
    304 	movdqa	%xmm1,%xmm6
    305 	movdqa	%xmm1,%xmm7
    306 
    307 	pslldq	$8,%xmm6
    308 	pslldq	$12,%xmm7
    309 
    310 	/*
    311 	 * %xmm0 := (nrk[0],
    312 	 *     nrk[1],
    313 	 *     nrk[2] = nrk[1] ^ rklo[0],
    314 	 *     nrk[3] = nrk[1] ^ rklo[0] ^ rklo[1])
    315 	 */
    316 	pxor	%xmm6,%xmm0
    317 	pxor	%xmm7,%xmm0
    318 
    319 	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
    320 	shufps	$0b01000100,%xmm4,%xmm1
    321 
    322 	movdqa	%xmm1,(%rdi)		/* store round key */
    323 	movdqa	%xmm0,0x10(%rdi)	/* store next round key */
    324 	lea	0x20(%rdi),%rdi		/* advance two round keys */
    325 	ret
    326 END(aesni_expand192a)
    327 
    328 /*
    329  * aesni_expand192b(uint128_t *roundkey@rdi, uint128_t prk@xmm0,
    330  *     uint128_t keygenassist@xmm2)
    331  *
    332  *	Set odd-numbered AES-192 round key.
    333  *
    334  *	Internal ABI.  On entry:
    335  *
    336  *		%rdi = rkp, pointer to round key to compute
    337  *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
    338  *		%xmm1 = (xxx, xxx, pprk[2], pprk[3])
    339  *		%xmm2 = (xxx, xxx, xxx, t = Rot(Sub(prk[3])) ^ RCON)
    340  *
    341  *	On exit:
    342  *
    343  *		%rdi = &rkp[1], rkp advanced by one round key
    344  *		%xmm0 = rk, the round key we just computed
    345  *		%xmm1 = (nrk[0], nrk[1], xxx, xxx), half of next round key
    346  *		%xmm2 = garbage
    347  *		%xmm4 = garbage
    348  *		%xmm5 = garbage
    349  *		%xmm6 = garbage
    350  *		%xmm7 = garbage
    351  */
    352 	.text
    353 	_ALIGN_TEXT
    354 	.type	aesni_expand192b,@function
    355 aesni_expand192b:
    356 	/*
    357 	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
    358 	 * i.e., set each word of %xmm2 to t := Rot(Sub(prk[3])) ^ RCON.
    359 	 */
    360 	pshufd	$0b11111111,%xmm2,%xmm2
    361 
    362 	/*
    363 	 * We need to compute:
    364 	 *
    365 	 * rk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2]
    366 	 * rk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3]
    367 	 * rk[2] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
    368 	 * rk[3] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
    369 	 *     ^ prk[1]
    370 	 * nrk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
    371 	 *     ^ prk[1] ^ prk[2]
    372 	 * nrk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
    373 	 *     ^ prk[1] ^ prk[2] ^ prk[3]
    374 	 */
    375 
    376 	/* %xmm1 := (pprk[2], pprk[3], prk[0], prk[1]) */
    377 	shufps	$0b01001110,%xmm0,%xmm1
    378 
    379 	/*
    380 	 * %xmm5 := (0, pprk[2], pprk[3], prk[0])
    381 	 * %xmm6 := (0, 0, pprk[2], pprk[3])
    382 	 * %xmm7 := (0, 0, 0, pprk[2])
    383 	 */
    384 	movdqa	%xmm1,%xmm5
    385 	movdqa	%xmm1,%xmm6
    386 	movdqa	%xmm1,%xmm7
    387 	pslldq	$4,%xmm5
    388 	pslldq	$8,%xmm6
    389 	pslldq	$12,%xmm7
    390 
     391 	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
    392 	pxor	%xmm2,%xmm1
    393 	pxor	%xmm5,%xmm1
    394 	pxor	%xmm6,%xmm1
    395 	pxor	%xmm7,%xmm1
    396 
    397 	/* %xmm4 := (prk[2], prk[3], xxx, xxx) */
    398 	pshufd	$0b00001110,%xmm0,%xmm4
    399 
    400 	/* %xmm5 := (0, prk[2], xxx, xxx) */
    401 	movdqa	%xmm4,%xmm5
    402 	pslldq	$4,%xmm5
    403 
    404 	/* %xmm0 := (rk[0], rk[1], rk[2], rk[3]) */
    405 	movdqa	%xmm1,%xmm0
    406 
    407 	/* %xmm1 := (rk[3], rk[3], xxx, xxx) */
    408 	shufps	$0b00001111,%xmm1,%xmm1
    409 
    410 	/*
    411 	 * %xmm1 := (nrk[0] = rk[3] ^ prk[2],
    412 	 *     nrk[1] = rk[3] ^ prk[2] ^ prk[3],
    413 	 *     xxx,
    414 	 *     xxx)
    415 	 */
    416 	pxor	%xmm4,%xmm1
    417 	pxor	%xmm5,%xmm1
    418 
    419 	movdqa	%xmm0,(%rdi)	/* store round key */
    420 	lea	0x10(%rdi),%rdi	/* advance to next round key address */
    421 	ret
    422 END(aesni_expand192b)
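
/*
 * For reference, the word-level AES-192 recurrence that the
 * expand192a/expand192b pair implements (sketch only; subword() and
 * rotword() are assumed helpers, rcon[] the round constants
 * 0x01..0x80):
 *
 *	for (i = 6; i < 52; i++) {
 *		uint32_t t = rk[i - 1];
 *		if (i % 6 == 0)
 *			t = subword(rotword(t)) ^ rcon[i/6 - 1];
 *		rk[i] = rk[i - 6] ^ t;
 *	}
 *
 * Each aeskeygenassist drives one i % 6 == 0 step, i.e. six schedule
 * words or a round key and a half, which is why expand192a stores two
 * round keys and expand192b stores one, leaving two words in %xmm1
 * for the next call.
 */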
    423 
    424 /*
    425  * aesni_expand256a(uint128_t *rkp@rdi, uint128_t pprk@xmm0,
    426  *     uint128_t prk@xmm1, uint128_t keygenassist@xmm2)
    427  *
    428  *	Set even-numbered AES-256 round key.
    429  *
    430  *	Internal ABI.  On entry:
    431  *
    432  *		%rdi = rkp, pointer to round key to compute
    433  *		%xmm0 = (pprk[0], pprk[1], pprk[2], pprk[3])
    434  *		%xmm1 = (prk[0], prk[1], prk[2], prk[3])
     435  *		%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
    436  *
    437  *	On exit:
    438  *
    439  *		%rdi = &rkp[1], rkp advanced by one round key
    440  *		%xmm0 = rk, the round key we just computed
    441  *		%xmm1 = prk, previous round key, preserved from entry
    442  *		%xmm2 = garbage
    443  *		%xmm4 = garbage
    444  *		%xmm5 = garbage
    445  *		%xmm6 = garbage
    446  *
    447  *	The computation turns out to be the same as for AES-128; the
    448  *	previous round key does not figure into it, only the
    449  *	previous-previous round key.
    450  */
    451 	aesni_expand256a = aesni_expand128
    452 
    453 /*
    454  * aesni_expand256b(uint128_t *rkp@rdi, uint128_t prk@xmm0,
    455  *     uint128_t pprk@xmm1, uint128_t keygenassist@xmm2)
    456  *
    457  *	Set odd-numbered AES-256 round key.
    458  *
    459  *	Internal ABI.  On entry:
    460  *
    461  *		%rdi = rkp, pointer to round key to compute
    462  *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
    463  *		%xmm1 = (pprk[0], pprk[1], pprk[2], pprk[3])
    464  *		%xmm2 = (xxx, xxx, t = Sub(prk[3]), xxx)
    465  *
    466  *	On exit:
    467  *
    468  *		%rdi = &rkp[1], rkp advanced by one round key
    469  *		%xmm0 = prk, previous round key, preserved from entry
    470  *		%xmm1 = rk, the round key we just computed
    471  *		%xmm2 = garbage
    472  *		%xmm4 = garbage
    473  *		%xmm5 = garbage
    474  *		%xmm6 = garbage
    475  */
    476 	.text
    477 	_ALIGN_TEXT
    478 	.type	aesni_expand256b,@function
    479 aesni_expand256b:
    480 	/*
    481 	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
     482 	 * %xmm2 := (%xmm2[2], %xmm2[2], %xmm2[2], %xmm2[2]),
    483 	 */
    484 	pshufd	$0b10101010,%xmm2,%xmm2
    485 
    486 	/*
    487 	 * %xmm4 := (0, pprk[0], pprk[1], pprk[2])
    488 	 * %xmm5 := (0, 0, pprk[0], pprk[1])
    489 	 * %xmm6 := (0, 0, 0, pprk[0])
    490 	 */
    491 	movdqa	%xmm1,%xmm4
    492 	movdqa	%xmm1,%xmm5
    493 	movdqa	%xmm1,%xmm6
    494 	pslldq	$4,%xmm4
    495 	pslldq	$8,%xmm5
    496 	pslldq	$12,%xmm6
    497 
    498 	/*
     499 	 * %xmm1 := (rk[0] = t ^ pprk[0],
    500 	 *     rk[1] = t ^ pprk[0] ^ pprk[1],
    501 	 *     rk[2] = t ^ pprk[0] ^ pprk[1] ^ pprk[2],
    502 	 *     rk[3] = t ^ pprk[0] ^ pprk[1] ^ pprk[2] ^ pprk[3])
    503 	 */
    504 	pxor	%xmm2,%xmm1
    505 	pxor	%xmm4,%xmm1
    506 	pxor	%xmm5,%xmm1
    507 	pxor	%xmm6,%xmm1
    508 
    509 	movdqa	%xmm1,(%rdi)	/* store round key */
    510 	lea	0x10(%rdi),%rdi	/* advance to next round key address */
    511 	ret
    512 END(aesni_expand256b)
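
/*
 * For reference, the word-level AES-256 recurrence, split between
 * expand256a (the i % 8 == 0 RotWord/Rcon step) and expand256b (the
 * i % 8 == 4 plain SubWord step) (sketch only; subword() and
 * rotword() are assumed helpers, rcon[] the round constants
 * 0x01..0x40):
 *
 *	for (i = 8; i < 60; i++) {
 *		uint32_t t = rk[i - 1];
 *		if (i % 8 == 0)
 *			t = subword(rotword(t)) ^ rcon[i/8 - 1];
 *		else if (i % 8 == 4)
 *			t = subword(t);
 *		rk[i] = rk[i - 8] ^ t;
 *	}
 */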
    513 
    514 /*
    515  * aesni_enctodec(const struct aesenc *enckey@rdi, struct aesdec *deckey@rsi,
    516  *     uint32_t nrounds@rdx)
    517  *
    518  *	Convert AES encryption round keys to AES decryption round keys.
     519  *	`nrounds' must be between 10 and 14.
    520  *
    521  *	Standard ABI calling convention.
    522  */
    523 ENTRY(aesni_enctodec)
    524 	shl	$4,%edx		/* rdx := byte offset of last round key */
    525 	movdqa	(%rdi,%rdx),%xmm0	/* load last round key */
    526 	movdqa	%xmm0,(%rsi)	/* store last round key verbatim */
    527 	jmp	2f
    528 	_ALIGN_TEXT
    529 1:	movdqa	(%rdi,%rdx),%xmm0	/* load round key */
    530 	aesimc	%xmm0,%xmm0	/* convert encryption to decryption */
    531 	movdqa	%xmm0,(%rsi)	/* store round key */
    532 2:	sub	$0x10,%rdx	/* advance to next round key */
    533 	lea	0x10(%rsi),%rsi
    534 	jnz	1b		/* repeat if more rounds */
    535 	movdqa	(%rdi),%xmm0	/* load first round key */
    536 	movdqa	%xmm0,(%rsi)	/* store first round key verbatim */
    537 	ret
    538 END(aesni_enctodec)
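
/*
 * Equivalently, with intrinsics (sketch only; erk/drk are assumed to
 * be plain arrays of the nrounds+1 encryption/decryption round keys):
 *
 *	drk[0] = erk[nrounds];
 *	for (i = 1; i < nrounds; i++)
 *		drk[i] = _mm_aesimc_si128(erk[nrounds - i]);
 *	drk[nrounds] = erk[0];
 *
 * That is, reverse the order of the round keys and apply
 * InvMixColumns to all but the outermost two, as required by the AES
 * equivalent inverse cipher that AESDEC implements.
 */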
    539 
    540 /*
    541  * aesni_enc(const struct aesenc *enckey@rdi, const uint8_t in[16] @rsi,
    542  *     uint8_t out[16] @rdx, uint32_t nrounds@ecx)
    543  *
    544  *	Encrypt a single block.
    545  *
    546  *	Standard ABI calling convention.
    547  */
    548 ENTRY(aesni_enc)
    549 	movdqu	(%rsi),%xmm0
    550 	call	aesni_enc1
    551 	movdqu	%xmm0,(%rdx)
    552 	ret
    553 END(aesni_enc)
    554 
    555 /*
    556  * aesni_dec(const struct aesdec *deckey@rdi, const uint8_t in[16] @rsi,
    557  *     uint8_t out[16] @rdx, uint32_t nrounds@ecx)
    558  *
    559  *	Decrypt a single block.
    560  *
    561  *	Standard ABI calling convention.
    562  */
    563 ENTRY(aesni_dec)
    564 	movdqu	(%rsi),%xmm0
    565 	call	aesni_dec1
    566 	movdqu	%xmm0,(%rdx)
    567 	ret
    568 END(aesni_dec)
    569 
    570 /*
    571  * aesni_cbc_enc(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
    572  *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t iv[16] @r8,
    573  *     uint32_t nrounds@r9d)
    574  *
    575  *	Encrypt a contiguous sequence of blocks with AES-CBC.
    576  *
    577  *	nbytes must be an integral multiple of 16.
    578  *
    579  *	Standard ABI calling convention.
    580  */
    581 ENTRY(aesni_cbc_enc)
    582 	cmp	$0,%rcx
    583 	jz	2f
    584 	mov	%rcx,%r10		/* r10 := nbytes */
    585 	movdqu	(%r8),%xmm0		/* xmm0 := chaining value */
    586 	_ALIGN_TEXT
    587 1:	movdqu	(%rsi),%xmm1		/* xmm1 := plaintext block */
    588 	lea	0x10(%rsi),%rsi
    589 	pxor	%xmm1,%xmm0		/* xmm0 := cv ^ ptxt */
    590 	mov	%r9d,%ecx		/* ecx := nrounds */
    591 	call	aesni_enc1		/* xmm0 := ciphertext block */
    592 	movdqu	%xmm0,(%rdx)
    593 	lea	0x10(%rdx),%rdx
    594 	sub	$0x10,%r10
    595 	jnz	1b			/* repeat if r10 is nonzero */
    596 	movdqu	%xmm0,(%r8)		/* store chaining value */
    597 2:	ret
    598 END(aesni_cbc_enc)
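
/*
 * Roughly, with an assumed single-block helper aes_enc_block()
 * (sketch only, not part of the build):
 *
 *	__m128i cv = _mm_loadu_si128((const __m128i *)iv);
 *	for (i = 0; i < nbytes/16; i++) {
 *		cv = _mm_xor_si128(cv,
 *		    _mm_loadu_si128((const __m128i *)in + i));
 *		cv = aes_enc_block(enckey, cv, nrounds);
 *		_mm_storeu_si128((__m128i *)out + i, cv);
 *	}
 *	_mm_storeu_si128((__m128i *)iv, cv);
 *
 * Each block's input depends on the previous block's output, so CBC
 * encryption is inherently serial, which is why no eight-block
 * encryption variant appears here.
 */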
    599 
    600 /*
    601  * aesni_cbc_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
    602  *     uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
    603  *     uint32_t nrounds@r9)
    604  *
    605  *	Decrypt a contiguous sequence of blocks with AES-CBC.
    606  *
    607  *	nbytes must be a positive integral multiple of 16.  This routine
    608  *	is not vectorized; use aesni_cbc_dec8 for >=8 blocks at once.
    609  *
    610  *	Standard ABI calling convention.
    611  */
    612 ENTRY(aesni_cbc_dec1)
    613 	push	%rbp			/* create stack frame uint128[1] */
    614 	mov	%rsp,%rbp
    615 	sub	$0x10,%rsp
    616 	movdqu	(%r8),%xmm8		/* xmm8 := iv */
    617 	movdqa	%xmm8,(%rsp)		/* save iv */
    618 	mov	%rcx,%r10		/* r10 := nbytes */
    619 	movdqu	-0x10(%rsi,%r10),%xmm0	/* xmm0 := last ciphertext block */
    620 	movdqu	%xmm0,(%r8)		/* update iv */
    621 	jmp	2f
    622 	_ALIGN_TEXT
    623 1:	movdqu	-0x10(%rsi,%r10),%xmm8	/* xmm8 := chaining value */
    624 	pxor	%xmm8,%xmm0		/* xmm0 := ptxt */
    625 	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
    626 	movdqa	%xmm8,%xmm0		/* move cv = ciphertext block */
    627 2:	mov	%r9d,%ecx		/* ecx := nrounds */
    628 	call	aesni_dec1		/* xmm0 := cv ^ ptxt */
    629 	sub	$0x10,%r10
    630 	jnz	1b			/* repeat if more blocks */
    631 	pxor	(%rsp),%xmm0		/* xmm0 := ptxt */
    632 	movdqu	%xmm0,(%rdx)		/* store first plaintext block */
    633 	leave
    634 	ret
    635 END(aesni_cbc_dec1)
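
/*
 * What the loop computes, back to front (sketch only; aes_dec_block()
 * is an assumed single-block helper, ctxt/ptxt are the input and
 * output viewed as arrays of n = nbytes/16 blocks, and the unaligned
 * loads, stores, and XOR intrinsics are elided for brevity):
 *
 *	newiv = ctxt[n - 1];
 *	for (i = n - 1; i > 0; i--)
 *		ptxt[i] = aes_dec_block(deckey, ctxt[i], nrounds) ^ ctxt[i - 1];
 *	ptxt[0] = aes_dec_block(deckey, ctxt[0], nrounds) ^ iv;
 *	iv = newiv;
 */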
    636 
    637 /*
    638  * aesni_cbc_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
    639  *     uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
    640  *     uint32_t nrounds@r9)
    641  *
    642  *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
    643  *
    644  *	nbytes must be a positive integral multiple of 128.
    645  *
    646  *	Standard ABI calling convention.
    647  */
    648 ENTRY(aesni_cbc_dec8)
    649 	push	%rbp			/* create stack frame uint128[1] */
    650 	mov	%rsp,%rbp
    651 	sub	$0x10,%rsp
    652 	movdqu	(%r8),%xmm8		/* xmm8 := iv */
    653 	movdqa	%xmm8,(%rsp)		/* save iv */
    654 	mov	%rcx,%r10		/* r10 := nbytes */
    655 	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := ciphertext block[n-1] */
    656 	movdqu	%xmm7,(%r8)		/* update iv */
    657 	jmp	2f
    658 	_ALIGN_TEXT
    659 1:	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := cv[0] */
    660 	pxor	%xmm7,%xmm0		/* xmm0 := ptxt[0] */
    661 	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
    662 2:	movdqu	-0x20(%rsi,%r10),%xmm6	/* xmm6 := ciphertext block[n-2] */
    663 	movdqu	-0x30(%rsi,%r10),%xmm5	/* xmm5 := ciphertext block[n-3] */
    664 	movdqu	-0x40(%rsi,%r10),%xmm4	/* xmm4 := ciphertext block[n-4] */
    665 	movdqu	-0x50(%rsi,%r10),%xmm3	/* xmm3 := ciphertext block[n-5] */
    666 	movdqu	-0x60(%rsi,%r10),%xmm2	/* xmm2 := ciphertext block[n-6] */
    667 	movdqu	-0x70(%rsi,%r10),%xmm1	/* xmm1 := ciphertext block[n-7] */
    668 	movdqu	-0x80(%rsi,%r10),%xmm0	/* xmm0 := ciphertext block[n-8] */
    669 	movdqa	%xmm6,%xmm15		/* xmm[8+i] := cv[i], 0<i<8 */
    670 	movdqa	%xmm5,%xmm14
    671 	movdqa	%xmm4,%xmm13
    672 	movdqa	%xmm3,%xmm12
    673 	movdqa	%xmm2,%xmm11
    674 	movdqa	%xmm1,%xmm10
    675 	movdqa	%xmm0,%xmm9
    676 	mov	%r9d,%ecx		/* ecx := nrounds */
    677 	call	aesni_dec8		/* xmm[i] := cv[i] ^ ptxt[i], 0<=i<8 */
    678 	pxor	%xmm15,%xmm7		/* xmm[i] := ptxt[i], 0<i<8 */
    679 	pxor	%xmm14,%xmm6
    680 	pxor	%xmm13,%xmm5
    681 	pxor	%xmm12,%xmm4
    682 	pxor	%xmm11,%xmm3
    683 	pxor	%xmm10,%xmm2
    684 	pxor	%xmm9,%xmm1
    685 	movdqu	%xmm7,-0x10(%rdx,%r10)	/* store plaintext blocks */
    686 	movdqu	%xmm6,-0x20(%rdx,%r10)
    687 	movdqu	%xmm5,-0x30(%rdx,%r10)
    688 	movdqu	%xmm4,-0x40(%rdx,%r10)
    689 	movdqu	%xmm3,-0x50(%rdx,%r10)
    690 	movdqu	%xmm2,-0x60(%rdx,%r10)
    691 	movdqu	%xmm1,-0x70(%rdx,%r10)
    692 	sub	$0x80,%r10
    693 	jnz	1b			/* repeat if more blocks */
    694 	pxor	(%rsp),%xmm0		/* xmm0 := ptxt[0] */
    695 	movdqu	%xmm0,(%rdx)		/* store first plaintext block */
    696 	leave
    697 	ret
    698 END(aesni_cbc_dec8)
    699 
    700 /*
    701  * aesni_xts_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
    702  *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
    703  *     uint32_t nrounds@r9d)
    704  *
    705  *	Encrypt a contiguous sequence of blocks with AES-XTS.
    706  *
    707  *	nbytes must be a positive integral multiple of 16.  This routine
    708  *	is not vectorized; use aesni_xts_enc8 for >=8 blocks at once.
    709  *
    710  *	Standard ABI calling convention.
    711  */
    712 ENTRY(aesni_xts_enc1)
    713 	mov	%rcx,%r10		/* r10 := nbytes */
    714 	movdqu	(%r8),%xmm15		/* xmm15 := tweak */
    715 	_ALIGN_TEXT
    716 1:	movdqu	(%rsi),%xmm0		/* xmm0 := ptxt */
     717 	lea	0x10(%rsi),%rsi		/* advance rsi to next block */
    718 	pxor	%xmm15,%xmm0		/* xmm0 := ptxt ^ tweak */
    719 	mov	%r9d,%ecx		/* ecx := nrounds */
    720 	call	aesni_enc1		/* xmm0 := AES(ptxt ^ tweak) */
    721 	pxor	%xmm15,%xmm0		/* xmm0 := AES(ptxt ^ tweak) ^ tweak */
    722 	movdqu	%xmm0,(%rdx)		/* store ciphertext block */
     723 	lea	0x10(%rdx),%rdx		/* advance rdx to next block */
    724 	call	aesni_xts_mulx		/* xmm15 *= x; trash xmm0 */
    725 	sub	$0x10,%r10
    726 	jnz	1b			/* repeat if more blocks */
    727 	movdqu	%xmm15,(%r8)		/* update tweak */
    728 	ret
    729 END(aesni_xts_enc1)
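
/*
 * Per block, the XTS step above is (sketch only; aes_enc_block() is
 * an assumed single-block helper and T the 128-bit tweak):
 *
 *	c = _mm_xor_si128(T,
 *	    aes_enc_block(enckey, _mm_xor_si128(p, T), nrounds));
 *	T = xts_mulx(T);
 *
 * where xts_mulx() is the multiplication by x performed by
 * aesni_xts_mulx below.
 */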
    730 
    731 /*
    732  * aesni_xts_enc8(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
    733  *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
    734  *     uint32_t nrounds@r9d)
    735  *
    736  *	Encrypt a contiguous sequence of blocks with AES-XTS.
    737  *
    738  *	nbytes must be a positive integral multiple of 128.
    739  *
    740  *	Standard ABI calling convention.
    741  */
    742 ENTRY(aesni_xts_enc8)
    743 	push	%rbp			/* create stack frame uint128[1] */
    744 	mov	%rsp,%rbp
    745 	sub	$0x10,%rsp
    746 	mov	%rcx,%r10		/* r10 := nbytes */
    747 	movdqu	(%r8),%xmm15		/* xmm15 := tweak[0] */
    748 	_ALIGN_TEXT
    749 1:	movdqa	%xmm15,%xmm8		/* xmm8 := tweak[0] */
    750 	call	aesni_xts_mulx		/* xmm15 := tweak[1] */
    751 	movdqa	%xmm15,%xmm9		/* xmm9 := tweak[1] */
    752 	call	aesni_xts_mulx		/* xmm15 := tweak[2] */
    753 	movdqa	%xmm15,%xmm10		/* xmm10 := tweak[2] */
    754 	call	aesni_xts_mulx		/* xmm15 := tweak[3] */
    755 	movdqa	%xmm15,%xmm11		/* xmm11 := tweak[3] */
    756 	call	aesni_xts_mulx		/* xmm15 := tweak[4] */
    757 	movdqa	%xmm15,%xmm12		/* xmm12 := tweak[4] */
    758 	call	aesni_xts_mulx		/* xmm15 := tweak[5] */
    759 	movdqa	%xmm15,%xmm13		/* xmm13 := tweak[5] */
    760 	call	aesni_xts_mulx		/* xmm15 := tweak[6] */
    761 	movdqa	%xmm15,%xmm14		/* xmm14 := tweak[6] */
    762 	call	aesni_xts_mulx		/* xmm15 := tweak[7] */
    763 	movdqu	(%rsi),%xmm0		/* xmm[i] := ptxt[i] */
    764 	movdqu	0x10(%rsi),%xmm1
    765 	movdqu	0x20(%rsi),%xmm2
    766 	movdqu	0x30(%rsi),%xmm3
    767 	movdqu	0x40(%rsi),%xmm4
    768 	movdqu	0x50(%rsi),%xmm5
    769 	movdqu	0x60(%rsi),%xmm6
    770 	movdqu	0x70(%rsi),%xmm7
    771 	lea	0x80(%rsi),%rsi		/* advance rsi to next block group */
    772 	movdqa	%xmm8,(%rsp)		/* save tweak[0] */
    773 	pxor	%xmm8,%xmm0		/* xmm[i] := ptxt[i] ^ tweak[i] */
    774 	pxor	%xmm9,%xmm1
    775 	pxor	%xmm10,%xmm2
    776 	pxor	%xmm11,%xmm3
    777 	pxor	%xmm12,%xmm4
    778 	pxor	%xmm13,%xmm5
    779 	pxor	%xmm14,%xmm6
    780 	pxor	%xmm15,%xmm7
    781 	mov	%r9d,%ecx		/* ecx := nrounds */
    782 	call	aesni_enc8		/* xmm[i] := AES(ptxt[i] ^ tweak[i]) */
    783 	pxor	(%rsp),%xmm0		/* xmm[i] := AES(...) ^ tweak[i] */
    784 	pxor	%xmm9,%xmm1
    785 	pxor	%xmm10,%xmm2
    786 	pxor	%xmm11,%xmm3
    787 	pxor	%xmm12,%xmm4
    788 	pxor	%xmm13,%xmm5
    789 	pxor	%xmm14,%xmm6
    790 	pxor	%xmm15,%xmm7
    791 	movdqu	%xmm0,(%rdx)		/* store ciphertext blocks */
    792 	movdqu	%xmm1,0x10(%rdx)
    793 	movdqu	%xmm2,0x20(%rdx)
    794 	movdqu	%xmm3,0x30(%rdx)
    795 	movdqu	%xmm4,0x40(%rdx)
    796 	movdqu	%xmm5,0x50(%rdx)
    797 	movdqu	%xmm6,0x60(%rdx)
    798 	movdqu	%xmm7,0x70(%rdx)
    799 	lea	0x80(%rdx),%rdx		/* advance rdx to next block group */
    800 	call	aesni_xts_mulx		/* xmm15 := tweak[8] */
    801 	sub	$0x80,%r10
    802 	jnz	1b			/* repeat if more block groups */
    803 	movdqu	%xmm15,(%r8)		/* update tweak */
    804 	leave
    805 	ret
    806 END(aesni_xts_enc8)
    807 
    808 /*
    809  * aesni_xts_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
    810  *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
    811  *     uint32_t nrounds@r9d)
    812  *
    813  *	Decrypt a contiguous sequence of blocks with AES-XTS.
    814  *
    815  *	nbytes must be a positive integral multiple of 16.  This routine
    816  *	is not vectorized; use aesni_xts_dec8 for >=8 blocks at once.
    817  *
    818  *	Standard ABI calling convention.
    819  */
    820 ENTRY(aesni_xts_dec1)
    821 	mov	%rcx,%r10		/* r10 := nbytes */
    822 	movdqu	(%r8),%xmm15		/* xmm15 := tweak */
    823 	_ALIGN_TEXT
    824 1:	movdqu	(%rsi),%xmm0		/* xmm0 := ctxt */
     825 	lea	0x10(%rsi),%rsi		/* advance rsi to next block */
    826 	pxor	%xmm15,%xmm0		/* xmm0 := ctxt ^ tweak */
    827 	mov	%r9d,%ecx		/* ecx := nrounds */
    828 	call	aesni_dec1		/* xmm0 := AES(ctxt ^ tweak) */
    829 	pxor	%xmm15,%xmm0		/* xmm0 := AES(ctxt ^ tweak) ^ tweak */
    830 	movdqu	%xmm0,(%rdx)		/* store plaintext block */
     831 	lea	0x10(%rdx),%rdx		/* advance rdx to next block */
    832 	call	aesni_xts_mulx		/* xmm15 *= x; trash xmm0 */
    833 	sub	$0x10,%r10
    834 	jnz	1b			/* repeat if more blocks */
    835 	movdqu	%xmm15,(%r8)		/* update tweak */
    836 	ret
    837 END(aesni_xts_dec1)
    838 
    839 /*
    840  * aesni_xts_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
    841  *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
    842  *     uint32_t nrounds@r9d)
    843  *
    844  *	Decrypt a contiguous sequence of blocks with AES-XTS.
    845  *
    846  *	nbytes must be a positive integral multiple of 128.
    847  *
    848  *	Standard ABI calling convention.
    849  */
    850 ENTRY(aesni_xts_dec8)
    851 	push	%rbp			/* create stack frame uint128[1] */
    852 	mov	%rsp,%rbp
    853 	sub	$0x10,%rsp
    854 	mov	%rcx,%r10		/* r10 := nbytes */
    855 	movdqu	(%r8),%xmm15		/* xmm15 := tweak[0] */
    856 	_ALIGN_TEXT
    857 1:	movdqa	%xmm15,%xmm8		/* xmm8 := tweak[0] */
    858 	call	aesni_xts_mulx		/* xmm15 := tweak[1] */
    859 	movdqa	%xmm15,%xmm9		/* xmm9 := tweak[1] */
    860 	call	aesni_xts_mulx		/* xmm15 := tweak[2] */
    861 	movdqa	%xmm15,%xmm10		/* xmm10 := tweak[2] */
    862 	call	aesni_xts_mulx		/* xmm15 := tweak[3] */
    863 	movdqa	%xmm15,%xmm11		/* xmm11 := tweak[3] */
     864 	call	aesni_xts_mulx		/* xmm15 := tweak[4] */
    865 	movdqa	%xmm15,%xmm12		/* xmm12 := tweak[4] */
    866 	call	aesni_xts_mulx		/* xmm15 := tweak[5] */
    867 	movdqa	%xmm15,%xmm13		/* xmm13 := tweak[5] */
    868 	call	aesni_xts_mulx		/* xmm15 := tweak[6] */
    869 	movdqa	%xmm15,%xmm14		/* xmm14 := tweak[6] */
    870 	call	aesni_xts_mulx		/* xmm15 := tweak[7] */
     871 	movdqu	(%rsi),%xmm0		/* xmm[i] := ctxt[i] */
    872 	movdqu	0x10(%rsi),%xmm1
    873 	movdqu	0x20(%rsi),%xmm2
    874 	movdqu	0x30(%rsi),%xmm3
    875 	movdqu	0x40(%rsi),%xmm4
    876 	movdqu	0x50(%rsi),%xmm5
    877 	movdqu	0x60(%rsi),%xmm6
    878 	movdqu	0x70(%rsi),%xmm7
    879 	lea	0x80(%rsi),%rsi		/* advance rsi to next block group */
    880 	movdqa	%xmm8,(%rsp)		/* save tweak[0] */
     881 	pxor	%xmm8,%xmm0		/* xmm[i] := ctxt[i] ^ tweak[i] */
    882 	pxor	%xmm9,%xmm1
    883 	pxor	%xmm10,%xmm2
    884 	pxor	%xmm11,%xmm3
    885 	pxor	%xmm12,%xmm4
    886 	pxor	%xmm13,%xmm5
    887 	pxor	%xmm14,%xmm6
    888 	pxor	%xmm15,%xmm7
    889 	mov	%r9d,%ecx		/* ecx := nrounds */
     890 	call	aesni_dec8		/* xmm[i] := AES(ctxt[i] ^ tweak[i]) */
    891 	pxor	(%rsp),%xmm0		/* xmm[i] := AES(...) ^ tweak[i] */
    892 	pxor	%xmm9,%xmm1
    893 	pxor	%xmm10,%xmm2
    894 	pxor	%xmm11,%xmm3
    895 	pxor	%xmm12,%xmm4
    896 	pxor	%xmm13,%xmm5
    897 	pxor	%xmm14,%xmm6
    898 	pxor	%xmm15,%xmm7
     899 	movdqu	%xmm0,(%rdx)		/* store plaintext blocks */
    900 	movdqu	%xmm1,0x10(%rdx)
    901 	movdqu	%xmm2,0x20(%rdx)
    902 	movdqu	%xmm3,0x30(%rdx)
    903 	movdqu	%xmm4,0x40(%rdx)
    904 	movdqu	%xmm5,0x50(%rdx)
    905 	movdqu	%xmm6,0x60(%rdx)
    906 	movdqu	%xmm7,0x70(%rdx)
    907 	lea	0x80(%rdx),%rdx		/* advance rdx to next block group */
    908 	call	aesni_xts_mulx		/* xmm15 := tweak[8] */
    909 	sub	$0x80,%r10
    910 	jnz	1b			/* repeat if more block groups */
    911 	movdqu	%xmm15,(%r8)		/* update tweak */
    912 	leave
    913 	ret
    914 END(aesni_xts_dec8)
    915 
    916 /*
    917  * aesni_xts_mulx(tweak@xmm15)
    918  *
    919  *	Multiply xmm15 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
    920  *	Uses %xmm0 as temporary.
    921  */
    922 	.text
    923 	_ALIGN_TEXT
    924 	.type	aesni_xts_mulx,@function
    925 aesni_xts_mulx:
    926 	/*
    927 	 * Simultaneously determine
    928 	 * (a) whether the high bit of the low quadword must be
    929 	 *     shifted into the low bit of the high quadword, and
    930 	 * (b) whether the high bit of the high quadword must be
    931 	 *     carried into x^128 = x^7 + x^2 + x + 1.
    932 	 */
    933 	pxor	%xmm0,%xmm0	/* xmm0 := 0 */
    934 	pcmpgtq	%xmm15,%xmm0	/* xmm0[i] := -1 if 0 > xmm15[i] else 0 */
    935 	pshufd	$0b01001110,%xmm0,%xmm0	/* swap halves of xmm0 */
    936 	pand	xtscarry(%rip),%xmm0	/* copy xtscarry according to mask */
    937 	psllq	$1,%xmm15	/* shift */
    938 	pxor	%xmm0,%xmm15	/* incorporate (a) and (b) */
    939 	ret
    940 END(aesni_xts_mulx)
    941 
    942 	.section .rodata
    943 	.p2align 4
    944 	.type	xtscarry,@object
    945 xtscarry:
    946 	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
    947 END(xtscarry)
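
/*
 * A portable sketch of the same multiplication by x, with the tweak
 * held as two little-endian 64-bit halves lo and hi:
 *
 *	carry = hi >> 63;
 *	hi = (hi << 1) | (lo >> 63);
 *	lo = (lo << 1) ^ (carry ? 0x87 : 0);
 *
 * 0x87 is x^7 + x^2 + x + 1, the reduction of x^128 modulo the XTS
 * polynomial.  The pcmpgtq/pshufd/pand sequence above computes both
 * conditional XORs branchlessly: the sign bit of each quadword,
 * swapped across halves by the pshufd, masks the matching xtscarry
 * constant (0x87 into the low quadword when the high bit of hi is
 * set, 1 into the high quadword when the high bit of lo is set),
 * which psllq/pxor then fold in.
 */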
    948 
    949 /*
    950  * aesni_xts_update(const uint8_t in[16] @rdi, uint8_t out[16] @rsi)
    951  *
    952  *	Update an AES-XTS tweak.
    953  *
    954  *	Standard ABI calling convention.
    955  */
    956 ENTRY(aesni_xts_update)
    957 	movdqu	(%rdi),%xmm15
    958 	call	aesni_xts_mulx
    959 	movdqu	%xmm15,(%rsi)
    960 	ret
    961 END(aesni_xts_update)
    962 
    963 /*
    964  * aesni_cbcmac_update1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
    965  *     size_t nbytes@rdx, uint8_t auth[16] @rcx, uint32_t nrounds@r8d)
    966  *
    967  *	Update CBC-MAC.
    968  *
    969  *	nbytes must be a positive integral multiple of 16.
    970  *
    971  *	Standard ABI calling convention.
    972  */
    973 ENTRY(aesni_cbcmac_update1)
    974 	movdqu	(%rcx),%xmm0		/* xmm0 := auth */
    975 	mov	%rdx,%r10		/* r10 := nbytes */
    976 	mov	%rcx,%rdx		/* rdx := &auth */
    977 	_ALIGN_TEXT
    978 1:	pxor	(%rsi),%xmm0		/* xmm0 ^= plaintext block */
    979 	lea	0x10(%rsi),%rsi
    980 	mov	%r8d,%ecx		/* ecx := nrounds */
    981 	call	aesni_enc1		/* xmm0 := auth'; trash rax,rcx,xmm8 */
    982 	sub	$0x10,%r10
    983 	jnz	1b
    984 	movdqu	%xmm0,(%rdx)		/* store auth' */
    985 	ret
    986 END(aesni_cbcmac_update1)
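
/*
 * i.e., the plain CBC-MAC recurrence (sketch only; aes_enc_block() is
 * an assumed single-block helper and m[] the message blocks):
 *
 *	for (i = 0; i < nbytes/16; i++)
 *		auth = aes_enc_block(enckey,
 *		    _mm_xor_si128(auth, m[i]), nrounds);
 */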
    987 
    988 /*
    989  * aesni_ccm_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
    990  *     uint8_t *out@rdx, size_t nbytes@rcx,
    991  *     uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
    992  *
    993  *	Update CCM encryption.
    994  *
    995  *	nbytes must be a positive integral multiple of 16.
    996  *
    997  *	Standard ABI calling convention.
    998  */
    999 ENTRY(aesni_ccm_enc1)
   1000 	mov	%rcx,%r10		/* r10 := nbytes */
   1001 	movdqu	0x10(%r8),%xmm2		/* xmm2 := ctr (be) */
   1002 	movdqa	bswap32(%rip),%xmm4	/* xmm4 := bswap32 table */
   1003 	movdqa	ctr32_inc(%rip),%xmm5	/* xmm5 := (0,0,0,1) (le) */
   1004 	movdqu	(%r8),%xmm0		/* xmm0 := auth */
   1005 	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (le) */
   1006 	_ALIGN_TEXT
   1007 1:	movdqu	(%rsi),%xmm3		/* xmm3 := plaintext block */
   1008 	paddd	%xmm5,%xmm2		/* increment ctr (32-bit) */
   1009 	lea	0x10(%rsi),%rsi
   1010 	movdqa	%xmm2,%xmm1		/* xmm1 := ctr (le) */
   1011 	mov	%r9d,%ecx		/* ecx := nrounds */
   1012 	pshufb	%xmm4,%xmm1		/* xmm1 := ctr (be) */
   1013 	pxor	%xmm3,%xmm0		/* xmm0 := auth ^ ptxt */
   1014 	call	aesni_enc2		/* trash rax/rcx/xmm8 */
   1015 	pxor	%xmm1,%xmm3		/* xmm3 := ciphertext block */
   1016 	sub	$0x10,%r10		/* count down bytes */
   1017 	movdqu	%xmm3,(%rdx)		/* store ciphertext block */
   1018 	lea	0x10(%rdx),%rdx
   1019 	jnz	1b			/* repeat if more blocks */
   1020 	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (be) */
   1021 	movdqu	%xmm0,(%r8)		/* store updated auth */
   1022 	movdqu	%xmm2,0x10(%r8)		/* store updated ctr */
   1023 	ret
   1024 END(aesni_ccm_enc1)
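
/*
 * Per block, the loop above advances the CCM state roughly as
 * (sketch only; aes_enc_block() is an assumed single-block helper and
 * ctr is the 32-bit big-endian counter in the last word of the
 * counter block ctrblock):
 *
 *	auth = aes_enc_block(enckey, _mm_xor_si128(auth, p[i]), nrounds);
 *	ctr++;
 *	c[i] = _mm_xor_si128(p[i],
 *	    aes_enc_block(enckey, ctrblock, nrounds));
 *
 * The two AES invocations are independent, so aesni_enc2 runs them
 * through the AES pipeline together.
 */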
   1025 
   1026 /*
   1027  * aesni_ccm_dec1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
   1028  *     uint8_t *out@rdx, size_t nbytes@rcx,
   1029  *     uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
   1030  *
   1031  *	Update CCM decryption.
   1032  *
   1033  *	nbytes must be a positive integral multiple of 16.
   1034  *
   1035  *	Standard ABI calling convention.
   1036  */
   1037 ENTRY(aesni_ccm_dec1)
   1038 	movdqu	0x10(%r8),%xmm2		/* xmm2 := ctr (be) */
   1039 	movdqa	bswap32(%rip),%xmm4	/* xmm4 := bswap32 table */
   1040 	movdqa	ctr32_inc(%rip),%xmm5	/* xmm5 := (0,0,0,1) (le) */
   1041 	movdqu	(%r8),%xmm1		/* xmm1 := auth */
   1042 	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (le) */
   1043 	mov	%rcx,%r10		/* r10 := nbytes */
   1044 
   1045 	/* Decrypt the first block.  */
   1046 	paddd	%xmm5,%xmm2		/* increment ctr (32-bit) */
   1047 	mov	%r9d,%ecx		/* ecx := nrounds */
   1048 	movdqa	%xmm2,%xmm0		/* xmm0 := ctr (le) */
   1049 	movdqu	(%rsi),%xmm3		/* xmm3 := ctxt */
   1050 	pshufb	%xmm4,%xmm0		/* xmm0 := ctr (be) */
   1051 	lea	0x10(%rsi),%rsi
   1052 	call	aesni_enc1		/* xmm0 := pad; trash rax/rcx/xmm8 */
   1053 	jmp	2f
   1054 
   1055 	_ALIGN_TEXT
   1056 1:	/*
   1057 	 * Authenticate the last block and decrypt the next block
   1058 	 * simultaneously.
   1059 	 *
   1060 	 *	xmm1 = auth ^ ptxt[-1]
   1061 	 *	xmm2 = ctr[-1] (le)
   1062 	 */
   1063 	paddd	%xmm5,%xmm2		/* increment ctr (32-bit) */
   1064 	mov	%r9d,%ecx		/* ecx := nrounds */
   1065 	movdqa	%xmm2,%xmm0		/* xmm0 := ctr (le) */
   1066 	movdqu	(%rsi),%xmm3		/* xmm3 := ctxt */
   1067 	pshufb	%xmm4,%xmm0		/* xmm0 := ctr (be) */
   1068 	lea	0x10(%rsi),%rsi
   1069 	call	aesni_enc2		/* xmm0 := pad, xmm1 := auth';
   1070 					 * trash rax/rcx/xmm8 */
   1071 2:	pxor	%xmm0,%xmm3		/* xmm3 := ptxt */
   1072 	sub	$0x10,%r10
   1073 	movdqu	%xmm3,(%rdx)		/* store plaintext */
   1074 	lea	0x10(%rdx),%rdx
   1075 	pxor	%xmm3,%xmm1		/* xmm1 := auth ^ ptxt */
   1076 	jnz	1b
   1077 
   1078 	/* Authenticate the last block.  */
   1079 	movdqa	%xmm1,%xmm0		/* xmm0 := auth ^ ptxt */
   1080 	mov	%r9d,%ecx		/* ecx := nrounds */
   1081 	call	aesni_enc1		/* xmm0 := auth' */
   1082 	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (be) */
   1083 	movdqu	%xmm0,(%r8)		/* store updated auth */
   1084 	movdqu	%xmm2,0x10(%r8)		/* store updated ctr */
   1085 	ret
   1086 END(aesni_ccm_dec1)
   1087 
   1088 	.section .rodata
   1089 	.p2align 4
   1090 	.type	bswap32,@object
   1091 bswap32:
   1092 	.byte	3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
   1093 END(bswap32)
   1094 
   1095 	.section .rodata
   1096 	.p2align 4
   1097 	.type	ctr32_inc,@object
   1098 ctr32_inc:
   1099 	.byte	0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0
   1100 END(ctr32_inc)
   1101 
   1102 /*
   1103  * aesni_enc1(const struct aesenc *enckey@rdi, uint128_t block@xmm0,
   1104  *     uint32_t nrounds@ecx)
   1105  *
   1106  *	Encrypt a single AES block in %xmm0.
   1107  *
   1108  *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
   1109  */
   1110 	.text
   1111 	_ALIGN_TEXT
   1112 	.type	aesni_enc1,@function
   1113 aesni_enc1:
   1114 	pxor	(%rdi),%xmm0	/* xor in first round key */
   1115 	shl	$4,%ecx		/* ecx := total byte size of round keys */
   1116 	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
   1117 	neg	%rcx		/* rcx := byte offset of round key from end */
   1118 	jmp	2f
   1119 	_ALIGN_TEXT
   1120 1:	aesenc	%xmm8,%xmm0
   1121 2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
   1122 	add	$0x10,%rcx
   1123 	jnz	1b		/* repeat if more rounds */
   1124 	aesenclast %xmm8,%xmm0
   1125 	ret
   1126 END(aesni_enc1)
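
/*
 * For reference, the same loop with intrinsics (sketch only; rk is
 * assumed to point at the nrounds+1 round keys in struct aesenc):
 *
 *	b = _mm_xor_si128(b, rk[0]);
 *	for (i = 1; i < nrounds; i++)
 *		b = _mm_aesenc_si128(b, rk[i]);
 *	b = _mm_aesenclast_si128(b, rk[nrounds]);
 *
 * The assembly indexes the round keys by a negative byte offset from
 * the end of the array, so the offset register doubles as the loop
 * termination test.
 */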
   1127 
   1128 /*
   1129  * aesni_enc2(const struct aesenc *enckey@rdi, uint128_t block0@xmm0,
   1130  *     uint128_t block1@xmm1, uint32_t nrounds@ecx)
   1131  *
   1132  *	Encrypt two AES blocks in %xmm0 and %xmm1.
   1133  *
   1134  *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
   1135  */
   1136 	.text
   1137 	_ALIGN_TEXT
   1138 	.type	aesni_enc2,@function
   1139 aesni_enc2:
   1140 	movdqa	(%rdi),%xmm8	/* xmm8 := first round key */
   1141 	shl	$4,%ecx		/* ecx := total byte size of round keys */
   1142 	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
   1143 	neg	%rcx		/* rcx := byte offset of round key from end */
   1144 	pxor	%xmm8,%xmm0	/* xor in first round key */
   1145 	pxor	%xmm8,%xmm1
   1146 	jmp	2f
   1147 	_ALIGN_TEXT
   1148 1:	aesenc	%xmm8,%xmm0
   1149 	aesenc	%xmm8,%xmm1
   1150 2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
   1151 	add	$0x10,%rcx
   1152 	jnz	1b		/* repeat if there's more */
   1153 	aesenclast %xmm8,%xmm0
   1154 	aesenclast %xmm8,%xmm1
   1155 	ret
   1156 END(aesni_enc2)
   1157 
   1158 /*
   1159  * aesni_enc8(const struct aesenc *enckey@rdi, uint128_t block0@xmm0, ...,
   1160  *     block7@xmm7, uint32_t nrounds@ecx)
   1161  *
   1162  *	Encrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
   1163  *
   1164  *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
   1165  */
   1166 	.text
   1167 	_ALIGN_TEXT
   1168 	.type	aesni_enc8,@function
   1169 aesni_enc8:
   1170 	movdqa	(%rdi),%xmm8	/* xor in first round key */
   1171 	pxor	%xmm8,%xmm0
   1172 	pxor	%xmm8,%xmm1
   1173 	pxor	%xmm8,%xmm2
   1174 	pxor	%xmm8,%xmm3
   1175 	pxor	%xmm8,%xmm4
   1176 	pxor	%xmm8,%xmm5
   1177 	pxor	%xmm8,%xmm6
   1178 	pxor	%xmm8,%xmm7
   1179 	shl	$4,%ecx		/* ecx := total byte size of round keys */
   1180 	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
   1181 	neg	%rcx		/* rcx := byte offset of round key from end */
   1182 	jmp	2f
   1183 	_ALIGN_TEXT
   1184 1:	aesenc	%xmm8,%xmm0
   1185 	aesenc	%xmm8,%xmm1
   1186 	aesenc	%xmm8,%xmm2
   1187 	aesenc	%xmm8,%xmm3
   1188 	aesenc	%xmm8,%xmm4
   1189 	aesenc	%xmm8,%xmm5
   1190 	aesenc	%xmm8,%xmm6
   1191 	aesenc	%xmm8,%xmm7
   1192 2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
   1193 	add	$0x10,%rcx
   1194 	jnz	1b		/* repeat if more rounds */
   1195 	aesenclast %xmm8,%xmm0
   1196 	aesenclast %xmm8,%xmm1
   1197 	aesenclast %xmm8,%xmm2
   1198 	aesenclast %xmm8,%xmm3
   1199 	aesenclast %xmm8,%xmm4
   1200 	aesenclast %xmm8,%xmm5
   1201 	aesenclast %xmm8,%xmm6
   1202 	aesenclast %xmm8,%xmm7
   1203 	ret
   1204 END(aesni_enc8)
   1205 
   1206 /*
   1207  * aesni_dec1(const struct aesdec *deckey@rdi, uint128_t block@xmm0,
   1208  *     uint32_t nrounds@ecx)
   1209  *
   1210  *	Decrypt a single AES block in %xmm0.
   1211  *
   1212  *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
   1213  */
   1214 	.text
   1215 	_ALIGN_TEXT
   1216 	.type	aesni_dec1,@function
   1217 aesni_dec1:
   1218 	pxor	(%rdi),%xmm0	/* xor in first round key */
   1219 	shl	$4,%ecx		/* ecx := byte offset of round key */
   1220 	lea	0x10(%rdi,%rcx),%rax	/* rax := pointer to round key */
   1221 	neg	%rcx		/* rcx := byte offset of round key from end */
   1222 	jmp	2f
   1223 	_ALIGN_TEXT
   1224 1:	aesdec	%xmm8,%xmm0
   1225 2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
   1226 	add	$0x10,%rcx
   1227 	jnz	1b		/* repeat if more rounds */
   1228 	aesdeclast %xmm8,%xmm0
   1229 	ret
   1230 END(aesni_dec1)
   1231 
   1232 /*
   1233  * aesni_dec8(const struct aesdec *deckey@rdi, uint128_t block0@xmm0, ...,
   1234  *     block7@xmm7, uint32_t nrounds@ecx)
   1235  *
   1236  *	Decrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
   1237  *
    1238  *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
   1239  */
   1240 	.text
   1241 	_ALIGN_TEXT
   1242 	.type	aesni_dec8,@function
   1243 aesni_dec8:
   1244 	movdqa	(%rdi),%xmm8	/* xor in first round key */
   1245 	pxor	%xmm8,%xmm0
   1246 	pxor	%xmm8,%xmm1
   1247 	pxor	%xmm8,%xmm2
   1248 	pxor	%xmm8,%xmm3
   1249 	pxor	%xmm8,%xmm4
   1250 	pxor	%xmm8,%xmm5
   1251 	pxor	%xmm8,%xmm6
   1252 	pxor	%xmm8,%xmm7
   1253 	shl	$4,%ecx		/* ecx := byte offset of round key */
   1254 	lea	0x10(%rdi,%rcx),%rax	/* rax := pointer to round key */
   1255 	neg	%rcx		/* rcx := byte offset of round key from end */
   1256 	jmp	2f
   1257 	_ALIGN_TEXT
   1258 1:	aesdec	%xmm8,%xmm0
   1259 	aesdec	%xmm8,%xmm1
   1260 	aesdec	%xmm8,%xmm2
   1261 	aesdec	%xmm8,%xmm3
   1262 	aesdec	%xmm8,%xmm4
   1263 	aesdec	%xmm8,%xmm5
   1264 	aesdec	%xmm8,%xmm6
   1265 	aesdec	%xmm8,%xmm7
   1266 2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
   1267 	add	$0x10,%rcx
   1268 	jnz	1b		/* repeat if more rounds */
   1269 	aesdeclast %xmm8,%xmm0
   1270 	aesdeclast %xmm8,%xmm1
   1271 	aesdeclast %xmm8,%xmm2
   1272 	aesdeclast %xmm8,%xmm3
   1273 	aesdeclast %xmm8,%xmm4
   1274 	aesdeclast %xmm8,%xmm5
   1275 	aesdeclast %xmm8,%xmm6
   1276 	aesdeclast %xmm8,%xmm7
   1277 	ret
   1278 END(aesni_dec8)
   1279