/*	$NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $");

#ifdef _KERNEL
#include <sys/systm.h>
#include <lib/libkern/libkern.h>
#else
#include <err.h>
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
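/* Userland stand-ins for the kernel's assertion and panic hooks.  */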
#define	KASSERT			assert
#define	panic(fmt, args...)	errx(1, fmt, ##args)
#endif

#include <crypto/aes/aes.h>
#include <crypto/aes/arch/x86/aes_sse2.h>

#include "aes_sse2_impl.h"

void
aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds)
{
	size_t key_len;

	switch (nrounds) {
	case 10:
		key_len = 16;
		break;
	case 12:
		key_len = 24;
		break;
	case 14:
		key_len = 32;
		break;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}

	aes_sse2_keysched(rk, key, key_len);
}
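
/*
 * Illustrative sketch only (not part of the API): encrypting one
 * block with AES-128, assuming the AES_128_NROUNDS constant from
 * <crypto/aes/aes.h> and caller-provided key/in/out buffers:
 *
 *	struct aesenc enc;
 *
 *	aes_sse2_setkey(enc.aese_aes.aes_rk64, key, AES_128_NROUNDS);
 *	aes_sse2_enc(&enc, in, out, AES_128_NROUNDS);
 */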

void
aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
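	/*
	 * The bitsliced core always works on four blocks at a time,
	 * so a single-block call necessarily leaves three of the
	 * four slots idle.
	 */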

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, encrypt, transform from bitslice.  */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block.  */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, decrypt, transform from bitslice.  */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block.  */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load the IV.  */
	cv = _mm_loadu_epi8(iv);

	/*
	 * CBC encryption is inherently sequential, so only q[0] can
	 * do useful work; zero the other slots so the bitslice
	 * transform never reads uninitialized memory.
	 */
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* Load input block and apply CV.  */
		q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in));

		/* Transform to bitslice, encrypt, transform from bitslice.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Remember ciphertext as CV and store output block.  */
		cv = aes_sse2_interleave_out(q[0]);
		_mm_storeu_epi8(out, cv);
	}

	/* Store updated IV.  */
	_mm_storeu_epi8(iv, cv);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv, iv, w;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load the IV.  */
	iv = _mm_loadu_epi8(ivp);

	/* Load the last cipher block.  */
	cv = _mm_loadu_epi8(in + nbytes - 16);

	/* Store the updated IV.  */
	_mm_storeu_epi8(ivp, cv);

	/* Process the last blocks if the count is not a multiple of four.  */
	if (nbytes % (4*16)) {
		unsigned n = (nbytes/16) % 4;

		KASSERT(n > 0);
		KASSERT(n < 4);

		q[1] = q[2] = q[3] = _mm_setzero_si128();
		q[n - 1] = aes_sse2_interleave_in(cv);
		switch (nbytes % 64) {
		case 48:
			w = _mm_loadu_epi8(in + nbytes - 32);
			q[1] = aes_sse2_interleave_in(w);
			w = _mm_loadu_epi8(in + nbytes - 48);
			q[0] = aes_sse2_interleave_in(w);
			break;
		case 32:
			w = _mm_loadu_epi8(in + nbytes - 32);
			q[0] = aes_sse2_interleave_in(w);
			break;
		case 16:
			break;
		}

		/* Decrypt.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		do {
			n--;
			w = aes_sse2_interleave_out(q[n]);
			if ((nbytes -= 16) == 0)
				goto out;
			cv = _mm_loadu_epi8(in + nbytes - 16);
			_mm_storeu_epi8(out + nbytes, w ^ cv);
		} while (n);
	}

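	/*
	 * Process the remaining blocks in batches of four, working
	 * from the end of the buffer toward the beginning: cv already
	 * holds the top ciphertext block of the batch about to be
	 * decrypted, and each block's chaining value is just the
	 * ciphertext block preceding it.
	 */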
	for (;;) {
		KASSERT(nbytes >= 64);
		nbytes -= 64;

		/*
		 * 1. Set up upper cipher block from cv.
		 * 2. Load lower cipher block into cv and set it up.
		 * 3. Decrypt.
		 */
		q[3] = aes_sse2_interleave_in(cv);

		w = _mm_loadu_epi8(in + nbytes + 4*8);
		q[2] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*4);
		q[1] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*0);
		q[0] = aes_sse2_interleave_in(w);

		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the upper output block.  */
		w = aes_sse2_interleave_out(q[3]);
		cv = _mm_loadu_epi8(in + nbytes + 4*8);
		_mm_storeu_epi8(out + nbytes + 4*12, w ^ cv);

		/* Store the middle output blocks.  */
		w = aes_sse2_interleave_out(q[2]);
		cv = _mm_loadu_epi8(in + nbytes + 4*4);
		_mm_storeu_epi8(out + nbytes + 4*8, w ^ cv);

		w = aes_sse2_interleave_out(q[1]);
		cv = _mm_loadu_epi8(in + nbytes + 4*0);
		_mm_storeu_epi8(out + nbytes + 4*4, w ^ cv);

		/*
		 * Get the first output block, but don't load the CV
		 * yet -- it might be the previous ciphertext block, or
		 * it might be the IV.
		 */
		w = aes_sse2_interleave_out(q[0]);

		/* Stop if we've reached the first output block.  */
		if (nbytes == 0)
			goto out;

		/*
		 * Load the preceding cipher block, and apply it as the
		 * chaining value to this one.
		 */
		cv = _mm_loadu_epi8(in + nbytes - 16);
		_mm_storeu_epi8(out + nbytes, w ^ cv);
	}

out:	/* Store the first output block.  */
	_mm_storeu_epi8(out, w ^ iv);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

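/*
 * Multiply the 128-bit tweak by x in GF(2^128) modulo the XTS
 * polynomial x^128 + x^7 + x^2 + x + 1: shift left by one bit and,
 * if a 1 was shifted out of the top, fold it back in by XORing 0x87
 * into the low byte.  The masks are computed without branches so
 * the update runs in constant time.
 */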
static inline __m128i
aes_sse2_xts_update(__m128i t)
{
	const __m128i one = _mm_set_epi64x(1, 1);
	__m128i s, m, c;

	s = _mm_srli_epi64(t, 63);	/* 1 if high bit set else 0 */
	m = _mm_sub_epi64(s, one);	/* 0 if high bit set else -1 */
	m = _mm_shuffle_epi32(m, 0x4e);	/* swap halves */
	c = _mm_set_epi64x(1, 0x87);	/* carry */

	return _mm_slli_epi64(t, 1) ^ (c & ~m);
}

static int
aes_sse2_xts_update_selftest(void)
{
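	/*
	 * Known-answer tests for the tweak update.  Each 128-bit
	 * value is given as four 32-bit words, least significant
	 * first: cases 1-3 walk a carry across the word boundaries,
	 * and cases 4-5 exercise the 0x87 reduction.
	 */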
	static const struct {
		uint32_t in[4], out[4];
	} cases[] = {
		[0] = { {1}, {2} },
		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
	};
	unsigned i;
	uint32_t t[4];
	int result = 0;

	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
		t[0] = cases[i].in[0];
		t[1] = cases[i].in[1];
		t[2] = cases[i].in[2];
		t[3] = cases[i].in[3];
		_mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t)));
		if (t[0] != cases[i].out[0] ||
		    t[1] != cases[i].out[1] ||
		    t[2] != cases[i].out[2] ||
		    t[3] != cases[i].out[3]) {
			printf("%s %u:"
			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
			    __func__, i, t[0], t[1], t[2], t[3]);
			result = -1;
		}
	}

	return result;
}

void
aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
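	/*
	 * t[i] is the tweak for block i of a four-block batch;
	 * t[4] carries over as t[0] of the next batch.
	 */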
	unsigned i;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load tweak.  */
	t[0] = _mm_loadu_epi8(tweak);

	/* Handle the leading blocks if the count is not a multiple of four.  */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Encrypt up to four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs.  */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Encrypt four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak.  */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);
}

void
aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
	unsigned i;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load tweak.  */
	t[0] = _mm_loadu_epi8(tweak);

	/* Handle the leading blocks if the count is not a multiple of four.  */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Decrypt up to four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs.  */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Decrypt four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak.  */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);
}

void
aes_sse2_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
    size_t nbytes, uint8_t auth[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load initial authenticator.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(auth));

	/*
	 * CBC-MAC chains sequentially, so only q[0] can do useful
	 * work; zero the other slots so the bitslice transform never
	 * reads uninitialized memory.
	 */
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	for (; nbytes; nbytes -= 16, in += 16) {
		q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in));
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);
	}

	/* Store updated authenticator.  */
	_mm_storeu_epi8(auth, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i ctr;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Set first block to authenticator.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr));

	/*
	 * Load the initial counter block.  Only the last word is ever
	 * incremented, so only it needs byte-swapping for arithmetic;
	 * the other words pass through as opaque 32-bit lanes.
	 */
	c0 = le32dec(authctr + 16 + 4*0);
	c1 = le32dec(authctr + 16 + 4*1);
	c2 = le32dec(authctr + 16 + 4*2);
	c3 = be32dec(authctr + 16 + 4*3);

	/*
	 * Set the other two blocks to garbage -- the CBC-MAC chains
	 * sequentially, so we can't take advantage of more than two
	 * of the four bitslice slots.
	 */
	q[2] = q[3] = _mm_setzero_si128();

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* Update authenticator.  */
		q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in));

		/* Increment 32-bit counter.  */
		ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
		q[1] = aes_sse2_interleave_in(ctr);

		/* Encrypt authenticator and counter.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Encrypt with CTR output.  */
		_mm_storeu_epi8(out,
		    _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[1]));
	}

	/* Update authenticator.  */
	_mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[0]));

	/* Update counter.  */
	be32enc(authctr + 16 + 4*3, c3);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i ctr, block;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/*
	 * Load the initial counter block.  Only the last word is ever
	 * incremented, so only it needs byte-swapping for arithmetic;
	 * the other words pass through as opaque 32-bit lanes.
	 */
	c0 = le32dec(authctr + 16 + 4*0);
	c1 = le32dec(authctr + 16 + 4*1);
	c2 = le32dec(authctr + 16 + 4*2);
	c3 = be32dec(authctr + 16 + 4*3);

	/* Increment 32-bit counter.  */
	ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
	q[0] = aes_sse2_interleave_in(ctr);

	/*
	 * Set the other blocks to garbage -- we don't have any
	 * plaintext to authenticate yet.
	 */
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Encrypt first CTR.  */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Load the initial authenticator.  */
	q[1] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr));

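	/*
	 * From here on, each bitsliced call does double duty: q[0]
	 * encrypts the counter for the next block while q[1] absorbs
	 * the plaintext just decrypted, so decryption and
	 * authentication proceed pipelined one block apart.
	 */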
	for (;; in += 16, out += 16) {
		/* Decrypt the block.  */
		block = _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[0]);

		/* Update authenticator.  */
		q[1] ^= aes_sse2_interleave_in(block);

		/* Store plaintext.  */
		_mm_storeu_epi8(out, block);

		/* If this is the last block, stop.  */
		if ((nbytes -= 16) == 0)
			break;

		/* Increment 32-bit counter.  */
		ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
		q[0] = aes_sse2_interleave_in(ctr);

		/* Authenticate previous plaintext, encrypt next CTR.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);
	}

	/*
	 * Authenticate last plaintext.  We're only doing this for the
	 * authenticator, not for the counter, so don't bother to
	 * initialize q[0], q[2], q[3].  (Even for the sake of
	 * sanitizers, they're already initialized to something by
	 * now.)
	 */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Update authenticator.  */
	_mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[1]));

	/* Update counter.  */
	be32enc(authctr + 16 + 4*3, c3);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

int
aes_sse2_selftest(void)
{

	if (aes_sse2_xts_update_selftest())
		return -1;

	/* XXX test aes_sse2_bitslice_decrypt */
	/* XXX test aes_sse2_bitslice_encrypt */
	/* XXX test aes_sse2_keysched */
	/* XXX test aes_sse2_ortho */
	/* XXX test aes_sse2_skey_expand */

	return 0;
}