/*	$NetBSD: aes_sse2_subr.c,v 1.3 2020/07/25 22:29:56 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: aes_sse2_subr.c,v 1.3 2020/07/25 22:29:56 riastradh Exp $");

#ifdef _KERNEL
#include <sys/systm.h>
#include <lib/libkern/libkern.h>
#else
#include <err.h>
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#define	KASSERT			assert
#define	panic(fmt, args...)	err(1, fmt, ##args)
#endif

#include <crypto/aes/aes.h>
#include <crypto/aes/arch/x86/aes_sse2.h>

#include "aes_sse2_impl.h"

void
aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds)
{
	size_t key_len;

	switch (nrounds) {
	case 10:
		key_len = 16;
		break;
	case 12:
		key_len = 24;
		break;
	case 14:
		key_len = 32;
		break;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}

	aes_sse2_keysched(rk, key, key_len);
}
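
/*
 * Illustrative usage sketch (an assumption, not from this file):
 * schedule AES-128 round keys and encrypt one block.  AES_128_NROUNDS
 * (10) is assumed from <crypto/aes/aes.h>; key/pt/ct are hypothetical
 * 16-byte buffers.
 *
 *	struct aesenc enc;
 *	uint8_t key[16], pt[16], ct[16];
 *
 *	aes_sse2_setkey(enc.aese_aes.aes_rk64, key, AES_128_NROUNDS);
 *	aes_sse2_enc(&enc, pt, ct, AES_128_NROUNDS);
 */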

void
aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, encrypt, transform from bitslice.  */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block.  */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, decrypt, transform from bitslice.  */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block.  */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load the IV.  */
	cv = _mm_loadu_epi8(iv);

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* Load input block and apply CV.  */
		q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in));

		/* Transform to bitslice, encrypt, transform from bitslice.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Remember ciphertext as CV and store output block.  */
		cv = aes_sse2_interleave_out(q[0]);
		_mm_storeu_epi8(out, cv);
	}

	/* Store updated IV.  */
	_mm_storeu_epi8(iv, cv);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}
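
/*
 * Note: CBC encryption is the recurrence C_i = E_k(P_i ^ C_{i-1}) with
 * C_0 = IV, so each block depends on the previous ciphertext and the
 * loop above is inherently sequential -- only lane q[0] of the
 * four-way bitslice state carries real data.
 */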

void
aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv, iv, w;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load the IV.  */
	iv = _mm_loadu_epi8(ivp);

	/* Load the last cipher block.  */
	cv = _mm_loadu_epi8(in + nbytes - 16);

	/* Store the updated IV.  */
	_mm_storeu_epi8(ivp, cv);

	/* Process the last blocks if not an even multiple of four.  */
	if (nbytes % (4*16)) {
		unsigned n = (nbytes/16) % 4;

		KASSERT(n > 0);
		KASSERT(n < 4);

		q[1] = q[2] = q[3] = _mm_setzero_si128();
		q[n - 1] = aes_sse2_interleave_in(cv);
		switch (nbytes % 64) {
		case 48:
			w = _mm_loadu_epi8(in + nbytes - 32);
			q[1] = aes_sse2_interleave_in(w);
			/*FALLTHROUGH*/
		case 32:
			w = _mm_loadu_epi8(in + nbytes - 48);
			q[0] = aes_sse2_interleave_in(w);
			/*FALLTHROUGH*/
		case 16:
			break;
		}

		/* Decrypt.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		do {
			n--;
			w = aes_sse2_interleave_out(q[n]);
			if ((nbytes -= 16) == 0)
				goto out;
			cv = _mm_loadu_epi8(in + nbytes - 16);
			_mm_storeu_epi8(out + nbytes, w ^ cv);
		} while (n);
	}

	for (;;) {
		KASSERT(nbytes >= 64);
		nbytes -= 64;

		/*
		 * 1. Set up upper cipher block from cv.
		 * 2. Load lower cipher block into cv and set it up.
		 * 3. Decrypt.
		 */
		q[3] = aes_sse2_interleave_in(cv);

		w = _mm_loadu_epi8(in + nbytes + 4*8);
		q[2] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*4);
		q[1] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*0);
		q[0] = aes_sse2_interleave_in(w);

		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the upper output block.  */
		w = aes_sse2_interleave_out(q[3]);
		cv = _mm_loadu_epi8(in + nbytes + 4*8);
		_mm_storeu_epi8(out + nbytes + 4*12, w ^ cv);

		/* Store the middle output blocks.  */
		w = aes_sse2_interleave_out(q[2]);
		cv = _mm_loadu_epi8(in + nbytes + 4*4);
		_mm_storeu_epi8(out + nbytes + 4*8, w ^ cv);

		w = aes_sse2_interleave_out(q[1]);
		cv = _mm_loadu_epi8(in + nbytes + 4*0);
		_mm_storeu_epi8(out + nbytes + 4*4, w ^ cv);

		/*
		 * Get the first output block, but don't load the CV
		 * yet -- it might be the previous ciphertext block, or
		 * it might be the IV.
		 */
		w = aes_sse2_interleave_out(q[0]);

		/* Stop if we've reached the first output block.  */
		if (nbytes == 0)
			goto out;

		/*
		 * Load the preceding cipher block, and apply it as the
		 * chaining value to this one.
		 */
		cv = _mm_loadu_epi8(in + nbytes - 16);
		_mm_storeu_epi8(out + nbytes, w ^ cv);
	}

out:	/* Store the first output block.  */
	_mm_storeu_epi8(out, w ^ iv);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}
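
/*
 * Note: CBC decryption computes P_i = D_k(C_i) ^ C_{i-1}, and distinct
 * plaintext blocks share no data dependency, so the loop above can
 * decrypt four blocks per bitslice pass; it walks backwards from the
 * final block so that the chaining values are still intact in the
 * input buffer when they are needed.
 */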

static inline __m128i
aes_sse2_xts_update(__m128i t)
{
	const __m128i one = _mm_set_epi64x(1, 1);
	__m128i s, m, c;

	s = _mm_srli_epi64(t, 63);	/* 1 if high bit set else 0 */
	m = _mm_sub_epi64(s, one);	/* 0 if high bit set else -1 */
	m = _mm_shuffle_epi32(m, 0x4e);	/* swap halves */
	c = _mm_set_epi64x(1, 0x87);	/* carry */

	return _mm_slli_epi64(t, 1) ^ (c & ~m);
}
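
/*
 * For reference, a portable scalar sketch of the same update
 * (illustrative only; xts_update_scalar is a hypothetical name):
 * multiplication by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1,
 * i.e. shift the 128-bit tweak left by one bit and, if a bit fell off
 * the top, xor 0x87 into the low byte.
 *
 *	static inline void
 *	xts_update_scalar(uint64_t t[static 2])
 *	{
 *		uint64_t carry = t[1] >> 63;
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 */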

static int
aes_sse2_xts_update_selftest(void)
{
	static const struct {
		uint32_t in[4], out[4];
	} cases[] = {
		[0] = { {1}, {2} },
		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
	};
	unsigned i;
	uint32_t t[4];
	int result = 0;

	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
		t[0] = cases[i].in[0];
		t[1] = cases[i].in[1];
		t[2] = cases[i].in[2];
		t[3] = cases[i].in[3];
		_mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t)));
		if (t[0] != cases[i].out[0] ||
		    t[1] != cases[i].out[1] ||
		    t[2] != cases[i].out[2] ||
		    t[3] != cases[i].out[3]) {
			printf("%s %u:"
			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
			    __func__, i, t[0], t[1], t[2], t[3]);
			result = -1;
		}
	}

	return result;
}

void
aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
	unsigned i;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load tweak.  */
	t[0] = _mm_loadu_epi8(tweak);

	/*
	 * Handle the leading blocks separately if the block count is
	 * not a multiple of four.
	 */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Encrypt up to four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs.  */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Encrypt four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak.  */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);
}
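
/*
 * Note: this is the XTS transform C_i = E_k(P_i ^ T_i) ^ T_i, with the
 * tweak evolving as T_{i+1} = T_i * x in GF(2^128) via
 * aes_sse2_xts_update.  Unlike CBC encryption, consecutive blocks are
 * independent, so all four bitslice lanes do useful work whenever at
 * least four blocks remain.
 */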

void
aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
	unsigned i;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load tweak.  */
	t[0] = _mm_loadu_epi8(tweak);

	/*
	 * Handle the leading blocks separately if the block count is
	 * not a multiple of four.
	 */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Decrypt up to four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs.  */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Decrypt four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak.  */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);
}

void
aes_sse2_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
    size_t nbytes, uint8_t auth[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load initial authenticator.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(auth));

	for (; nbytes; nbytes -= 16, in += 16) {
		q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in));
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);
	}

	/* Store updated authenticator.  */
	_mm_storeu_epi8(auth, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}
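
/*
 * Note: this is the CBC-MAC recurrence A <- E_k(A ^ M_i) over message
 * blocks M_i.  As in CBC encryption, each step depends on the last, so
 * only lane q[0] of the bitslice state is meaningful here.
 */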

void
aes_sse2_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i ctr;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Set first block to authenticator.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr));

	/*
	 * Load the initial counter block; only its last word is
	 * big-endian -- that's the 32-bit counter we increment.
	 */
	c0 = le32dec(authctr + 16 + 4*0);
	c1 = le32dec(authctr + 16 + 4*1);
	c2 = le32dec(authctr + 16 + 4*2);
	c3 = be32dec(authctr + 16 + 4*3);

	/*
	 * Set the other blocks to garbage -- the CBC-MAC chaining is
	 * sequential, so we can't take advantage of the spare lanes.
	 */
	q[2] = q[3] = _mm_setzero_si128();

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* Update authenticator.  */
		q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in));

		/* Increment 32-bit counter.  */
		ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
		q[1] = aes_sse2_interleave_in(ctr);

		/* Encrypt authenticator and counter.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Encrypt with CTR output.  */
		_mm_storeu_epi8(out,
		    _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[1]));
	}

	/* Update authenticator.  */
	_mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[0]));

	/* Update counter.  */
	be32enc(authctr + 16 + 4*3, c3);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}
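
/*
 * Note: per CCM, only the last four bytes of the counter block are the
 * block counter, kept big-endian; the per-block increment of c3 above
 * is equivalent to the portable
 *
 *	be32enc(authctr + 28, be32dec(authctr + 28) + 1);
 *
 * with the flags/nonce bytes untouched and the final value written
 * back only once, at the end.
 */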

void
aes_sse2_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i ctr, block;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/*
	 * Load the initial counter block; only its last word is
	 * big-endian -- that's the 32-bit counter we increment.
	 */
	c0 = le32dec(authctr + 16 + 4*0);
	c1 = le32dec(authctr + 16 + 4*1);
	c2 = le32dec(authctr + 16 + 4*2);
	c3 = be32dec(authctr + 16 + 4*3);

	/* Increment 32-bit counter.  */
	ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
	q[0] = aes_sse2_interleave_in(ctr);

	/*
	 * Set the other blocks to garbage -- we don't have any
	 * plaintext to authenticate yet.
	 */
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Encrypt first CTR.  */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Load the initial authenticator.  */
	q[1] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr));

	for (;; in += 16, out += 16) {
		/* Decrypt the block.  */
		block = _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[0]);

		/* Update authenticator.  */
		q[1] ^= aes_sse2_interleave_in(block);

		/* Store plaintext.  */
		_mm_storeu_epi8(out, block);

		/* If this is the last block, stop.  */
		if ((nbytes -= 16) == 0)
			break;

		/* Increment 32-bit counter.  */
		ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
		q[0] = aes_sse2_interleave_in(ctr);

		/* Authenticate previous plaintext, encrypt next CTR.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);
	}

	/*
	 * Authenticate last plaintext.  We're only doing this for the
	 * authenticator, not for the counter, so don't bother to
	 * initialize q[0], q[2], q[3].  (Even for the sake of
	 * sanitizers, they're already initialized to something by
	 * now.)
	 */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Update authenticator.  */
	_mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[1]));

	/* Update counter.  */
	be32enc(authctr + 16 + 4*3, c3);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

int
aes_sse2_selftest(void)
{

	if (aes_sse2_xts_update_selftest())
		return -1;

	/* XXX test aes_sse2_bitslice_decrypt */
	/* XXX test aes_sse2_bitslice_encrypt */
	/* XXX test aes_sse2_keysched */
	/* XXX test aes_sse2_ortho */
	/* XXX test aes_sse2_skey_expand */

	return 0;
}