/*	$NetBSD: aes_via.c,v 1.7 2024/06/16 13:03:48 christos Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: aes_via.c,v 1.7 2024/06/16 13:03:48 christos Exp $");

#ifdef _KERNEL
#include <sys/types.h>
#include <sys/evcnt.h>
#include <sys/systm.h>
#else
#include <assert.h>
#include <err.h>
#include <stdint.h>
#include <string.h>
#define	KASSERT			assert
#define	panic(fmt, args...)	err(1, fmt, args)
struct evcnt { uint64_t ev_count; };
#define	EVCNT_INITIALIZER(a,b,c,d) {0}
#define	EVCNT_ATTACH_STATIC(name)	static char name##_attach __unused = 0
#endif

#include <crypto/aes/aes.h>
#include <crypto/aes/aes_bear.h>
#include <crypto/aes/aes_impl.h>

#ifdef _KERNEL
#include <x86/cpufunc.h>
#include <x86/cpuvar.h>
#include <x86/fpu.h>
#include <x86/specialreg.h>
#include <x86/via_padlock.h>
#else
#include <cpuid.h>
#define	fpu_kern_enter()	((void)0)
#define	fpu_kern_leave()	((void)0)
#define C3_CRYPT_CWLO_ROUND_M		0x0000000f
#define C3_CRYPT_CWLO_ALG_M		0x00000070
#define C3_CRYPT_CWLO_ALG_AES		0x00000000
#define C3_CRYPT_CWLO_KEYGEN_M		0x00000080
#define C3_CRYPT_CWLO_KEYGEN_HW		0x00000000
#define C3_CRYPT_CWLO_KEYGEN_SW		0x00000080
#define C3_CRYPT_CWLO_NORMAL		0x00000000
#define C3_CRYPT_CWLO_INTERMEDIATE	0x00000100
#define C3_CRYPT_CWLO_ENCRYPT		0x00000000
#define C3_CRYPT_CWLO_DECRYPT		0x00000200
#define C3_CRYPT_CWLO_KEY128		0x0000000a      /* 128bit, 10 rds */
#define C3_CRYPT_CWLO_KEY192		0x0000040c      /* 192bit, 12 rds */
#define C3_CRYPT_CWLO_KEY256		0x0000080e      /* 256bit, 14 rds */
#endif

static void
aesvia_reload_keys(void)
{

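	/*
	 * PadLock caches the loaded key schedule between xcrypt
	 * operations and, per the VIA PadLock Programming Guide,
	 * tracks its validity via EFLAGS (bit 30); a no-op pushf/popf
	 * rewrites EFLAGS, which forces the next xcrypt to reload the
	 * key material from the address in EBX.
	 */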
	asm volatile("pushf; popf");
}

static uint32_t
aesvia_keylen_cw0(unsigned nrounds)
{

	/*
	 * Determine the control word bits for the key size / number of
	 * rounds.  For AES-128, the hardware can do key expansion on
	 * the fly; for AES-192 and AES-256, software must do it.
	 */
	switch (nrounds) {
	case AES_128_NROUNDS:
		return C3_CRYPT_CWLO_KEY128;
	case AES_192_NROUNDS:
		return C3_CRYPT_CWLO_KEY192 | C3_CRYPT_CWLO_KEYGEN_SW;
	case AES_256_NROUNDS:
		return C3_CRYPT_CWLO_KEY256 | C3_CRYPT_CWLO_KEYGEN_SW;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}
}

static void
aesvia_setenckey(struct aesenc *enc, const uint8_t *key, uint32_t nrounds)
{
	size_t key_len;

	switch (nrounds) {
	case AES_128_NROUNDS:
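		/*
		 * With hardware key expansion (see aesvia_keylen_cw0
		 * above) we need only store the four raw key words;
		 * le32dec amounts to a byte copy on little-endian x86.
		 */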
		enc->aese_aes.aes_rk[0] = le32dec(key + 4*0);
		enc->aese_aes.aes_rk[1] = le32dec(key + 4*1);
		enc->aese_aes.aes_rk[2] = le32dec(key + 4*2);
		enc->aese_aes.aes_rk[3] = le32dec(key + 4*3);
		return;
	case AES_192_NROUNDS:
		key_len = 24;
		break;
	case AES_256_NROUNDS:
		key_len = 32;
		break;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}
	br_aes_ct_keysched_stdenc(enc->aese_aes.aes_rk, key, key_len);
}

static void
aesvia_setdeckey(struct aesdec *dec, const uint8_t *key, uint32_t nrounds)
{
	size_t key_len;

	switch (nrounds) {
	case AES_128_NROUNDS:
		dec->aesd_aes.aes_rk[0] = le32dec(key + 4*0);
		dec->aesd_aes.aes_rk[1] = le32dec(key + 4*1);
		dec->aesd_aes.aes_rk[2] = le32dec(key + 4*2);
		dec->aesd_aes.aes_rk[3] = le32dec(key + 4*3);
		return;
	case AES_192_NROUNDS:
		key_len = 24;
		break;
	case AES_256_NROUNDS:
		key_len = 32;
		break;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}
	br_aes_ct_keysched_stddec(dec->aesd_aes.aes_rk, key, key_len);
}

static inline void
aesvia_encN(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks, uint32_t cw0)
{
	const uint32_t cw[4] __aligned(16) = {
		[0] = (cw0
		    | C3_CRYPT_CWLO_ALG_AES
		    | C3_CRYPT_CWLO_ENCRYPT
		    | C3_CRYPT_CWLO_NORMAL),
	};

	KASSERT(((uintptr_t)enc & 0xf) == 0);
	KASSERT(((uintptr_t)in & 0xf) == 0);
	KASSERT(((uintptr_t)out & 0xf) == 0);

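	/*
	 * REP XCRYPTECB takes its block count in ECX, source in ESI,
	 * destination in EDI, key material in EBX, and control word in
	 * EDX; it advances ESI/EDI and counts ECX down to zero, hence
	 * the read/write constraints.
	 */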
	asm volatile("rep xcryptecb"
	    : "+c"(nblocks), "+S"(in), "+D"(out)
	    : "b"(enc), "d"(cw)
	    : "memory", "cc");
}

static inline void
aesvia_decN(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks, uint32_t cw0)
{
	const uint32_t cw[4] __aligned(16) = {
		[0] = (cw0
		    | C3_CRYPT_CWLO_ALG_AES
		    | C3_CRYPT_CWLO_DECRYPT
		    | C3_CRYPT_CWLO_NORMAL),
	};

	KASSERT(((uintptr_t)dec & 0xf) == 0);
	KASSERT(((uintptr_t)in & 0xf) == 0);
	KASSERT(((uintptr_t)out & 0xf) == 0);

	asm volatile("rep xcryptecb"
	    : "+c"(nblocks), "+S"(in), "+D"(out)
	    : "b"(dec), "d"(cw)
	    : "memory", "cc");
}

static struct evcnt enc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "enc aligned");
EVCNT_ATTACH_STATIC(enc_aligned_evcnt);
static struct evcnt enc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "enc unaligned");
EVCNT_ATTACH_STATIC(enc_unaligned_evcnt);

static void
aesvia_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);

	fpu_kern_enter();
	aesvia_reload_keys();
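	/*
	 * Take the fast path only if in and out are 16-byte aligned
	 * and in is not the last block of a 4 KB page, since the
	 * hardware reads one block past the input (see below).
	 */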
	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0 &&
	    ((uintptr_t)in & 0xff0) != 0xff0) {
		enc_aligned_evcnt.ev_count++;
		aesvia_encN(enc, in, out, 1, cw0);
	} else {
		enc_unaligned_evcnt.ev_count++;
		/*
		 * VIA requires 16-byte/128-bit alignment, and
		 * xcrypt-ecb reads one block past the one we're
		 * working on -- which may go past the end of the page
		 * into unmapped territory.  Use a bounce buffer if
		 * either constraint is violated.
		 */
		uint8_t inbuf[16] __aligned(16);
		uint8_t outbuf[16] __aligned(16);

		memcpy(inbuf, in, 16);
		aesvia_encN(enc, inbuf, outbuf, 1, cw0);
		memcpy(out, outbuf, 16);

		explicit_memset(inbuf, 0, sizeof inbuf);
		explicit_memset(outbuf, 0, sizeof outbuf);
	}
	fpu_kern_leave();
}

static struct evcnt dec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "dec aligned");
EVCNT_ATTACH_STATIC(dec_aligned_evcnt);
static struct evcnt dec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "dec unaligned");
EVCNT_ATTACH_STATIC(dec_unaligned_evcnt);

static void
aesvia_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0 &&
	    ((uintptr_t)in & 0xff0) != 0xff0) {
		dec_aligned_evcnt.ev_count++;
		aesvia_decN(dec, in, out, 1, cw0);
	} else {
		dec_unaligned_evcnt.ev_count++;
		/*
		 * VIA requires 16-byte/128-bit alignment, and
		 * xcrypt-ecb reads one block past the one we're
		 * working on -- which may go past the end of the page
		 * into unmapped territory.  Use a bounce buffer if
		 * either constraint is violated.
		 */
		uint8_t inbuf[16] __aligned(16);
		uint8_t outbuf[16] __aligned(16);

		memcpy(inbuf, in, 16);
		aesvia_decN(dec, inbuf, outbuf, 1, cw0);
		memcpy(out, outbuf, 16);

		explicit_memset(inbuf, 0, sizeof inbuf);
		explicit_memset(outbuf, 0, sizeof outbuf);
	}
	fpu_kern_leave();
}

static inline void
aesvia_cbc_encN(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks, uint8_t **ivp, uint32_t cw0)
{
	const uint32_t cw[4] __aligned(16) = {
		[0] = (cw0
		    | C3_CRYPT_CWLO_ALG_AES
		    | C3_CRYPT_CWLO_ENCRYPT
		    | C3_CRYPT_CWLO_NORMAL),
	};

	KASSERT(((uintptr_t)enc & 0xf) == 0);
	KASSERT(((uintptr_t)in & 0xf) == 0);
	KASSERT(((uintptr_t)out & 0xf) == 0);
	KASSERT(((uintptr_t)*ivp & 0xf) == 0);

	/*
	 * Register effects:
	 * - Counts nblocks down to zero.
	 * - Advances in by nblocks (units of blocks).
	 * - Advances out by nblocks (units of blocks).
	 * - Updates *ivp to point at the last block of out.
	 */
	asm volatile("rep xcryptcbc"
	    : "+c"(nblocks), "+S"(in), "+D"(out), "+a"(*ivp)
	    : "b"(enc), "d"(cw)
	    : "memory", "cc");
}

static inline void
aesvia_cbc_decN(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks, uint8_t iv[static 16],
    uint32_t cw0)
{
	const uint32_t cw[4] __aligned(16) = {
		[0] = (cw0
		    | C3_CRYPT_CWLO_ALG_AES
		    | C3_CRYPT_CWLO_DECRYPT
		    | C3_CRYPT_CWLO_NORMAL),
	};

	KASSERT(((uintptr_t)dec & 0xf) == 0);
	KASSERT(((uintptr_t)in & 0xf) == 0);
	KASSERT(((uintptr_t)out & 0xf) == 0);
	KASSERT(((uintptr_t)iv & 0xf) == 0);

	/*
	 * Register effects:
	 * - Counts nblocks down to zero.
	 * - Advances in by nblocks (units of blocks).
	 * - Advances out by nblocks (units of blocks).
	 * Memory side effects:
	 * - Writes what was the last block of in at the address iv.
	 */
	asm volatile("rep xcryptcbc"
	    : "+c"(nblocks), "+S"(in), "+D"(out)
	    : "a"(iv), "b"(dec), "d"(cw)
	    : "memory", "cc");
}

static inline void
xor128(void *x, const void *a, const void *b)
{
	uint32_t *x32 = x;
	const uint32_t *a32 = a;
	const uint32_t *b32 = b;

	x32[0] = a32[0] ^ b32[0];
	x32[1] = a32[1] ^ b32[1];
	x32[2] = a32[2] ^ b32[2];
	x32[3] = a32[3] ^ b32[3];
}

static struct evcnt cbcenc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcenc aligned");
EVCNT_ATTACH_STATIC(cbcenc_aligned_evcnt);
static struct evcnt cbcenc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcenc unaligned");
EVCNT_ATTACH_STATIC(cbcenc_unaligned_evcnt);

static void
aesvia_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);

	KASSERT(nbytes % 16 == 0);
	if (nbytes == 0)
		return;

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out | (uintptr_t)iv) & 0xf) == 0) {
		cbcenc_aligned_evcnt.ev_count++;
		uint8_t *ivp = iv;
		aesvia_cbc_encN(enc, in, out, nbytes/16, &ivp, cw0);
		memcpy(iv, ivp, 16);
	} else {
		cbcenc_unaligned_evcnt.ev_count++;
		uint8_t cv[16] __aligned(16);
		uint8_t tmp[16] __aligned(16);

		memcpy(cv, iv, 16);
		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
			memcpy(tmp, in, 16);
			xor128(tmp, tmp, cv);
			aesvia_encN(enc, tmp, cv, 1, cw0);
			memcpy(out, cv, 16);
		}
		memcpy(iv, cv, 16);
	}
	fpu_kern_leave();
}

static struct evcnt cbcdec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcdec aligned");
EVCNT_ATTACH_STATIC(cbcdec_aligned_evcnt);
static struct evcnt cbcdec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcdec unaligned");
EVCNT_ATTACH_STATIC(cbcdec_unaligned_evcnt);

static void
aesvia_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);

	KASSERT(nbytes % 16 == 0);
	if (nbytes == 0)
		return;

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out | (uintptr_t)iv) & 0xf) == 0) {
		cbcdec_aligned_evcnt.ev_count++;
		aesvia_cbc_decN(dec, in, out, nbytes/16, iv, cw0);
	} else {
		cbcdec_unaligned_evcnt.ev_count++;
		uint8_t iv0[16] __aligned(16);
		uint8_t cv[16] __aligned(16);
		uint8_t tmp[16] __aligned(16);

		memcpy(iv0, iv, 16);
		memcpy(cv, in + nbytes - 16, 16);
		memcpy(iv, cv, 16);

		for (;;) {
			aesvia_decN(dec, cv, tmp, 1, cw0);
			if ((nbytes -= 16) == 0)
				break;
			memcpy(cv, in + nbytes - 16, 16);
			xor128(tmp, tmp, cv);
			memcpy(out + nbytes, tmp, 16);
		}

		xor128(tmp, tmp, iv0);
		memcpy(out, tmp, 16);
		explicit_memset(tmp, 0, sizeof tmp);
	}
	fpu_kern_leave();
}

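/*
 * Multiply the 128-bit tweak t3:t2:t1:t0 (little-endian 32-bit words)
 * by x in GF(2^128) modulo the XTS polynomial x^128 + x^7 + x^2 + x + 1:
 * shift the whole value left by one bit and, if a bit carries out of
 * the top, fold it back in as 0x87.
 */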
static inline void
aesvia_xts_update(uint32_t *t0, uint32_t *t1, uint32_t *t2, uint32_t *t3)
{
	uint32_t s0, s1, s2, s3;

	s0 = *t0 >> 31;
	s1 = *t1 >> 31;
	s2 = *t2 >> 31;
	s3 = *t3 >> 31;
	*t0 = (*t0 << 1) ^ (-s3 & 0x87);
	*t1 = (*t1 << 1) ^ s0;
	*t2 = (*t2 << 1) ^ s1;
	*t3 = (*t3 << 1) ^ s2;
}

static int
aesvia_xts_update_selftest(void)
{
	static const struct {
		uint32_t in[4], out[4];
	} cases[] = {
		{ {1}, {2} },
		{ {0x80000000U,0,0,0}, {0,1,0,0} },
		{ {0,0x80000000U,0,0}, {0,0,1,0} },
		{ {0,0,0x80000000U,0}, {0,0,0,1} },
		{ {0,0,0,0x80000000U}, {0x87,0,0,0} },
		{ {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
	};
	unsigned i;
	uint32_t t0, t1, t2, t3;

	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
		t0 = cases[i].in[0];
		t1 = cases[i].in[1];
		t2 = cases[i].in[2];
		t3 = cases[i].in[3];
		aesvia_xts_update(&t0, &t1, &t2, &t3);
		if (t0 != cases[i].out[0] ||
		    t1 != cases[i].out[1] ||
		    t2 != cases[i].out[2] ||
		    t3 != cases[i].out[3])
			return -1;
	}

	/* Success!  */
	return 0;
}

static struct evcnt xtsenc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "xtsenc aligned");
EVCNT_ATTACH_STATIC(xtsenc_aligned_evcnt);
static struct evcnt xtsenc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "xtsenc unaligned");
EVCNT_ATTACH_STATIC(xtsenc_unaligned_evcnt);

static void
aesvia_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint32_t t[4];

	KASSERT(nbytes % 16 == 0);

	memcpy(t, tweak, 16);

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0) {
		xtsenc_aligned_evcnt.ev_count++;
		unsigned lastblock = 0;
		uint32_t buf[8*4] __aligned(16);

		/*
		 * Make sure the last block is not the last block of a
		 * page.  (Note that we store the AES input in `out' as
		 * a temporary buffer, rather than reading it directly
		 * from `in', since we have to combine the tweak
		 * first.)
		 */
		lastblock = 16*(((uintptr_t)(out + nbytes) & 0xfff) == 0);
		nbytes -= lastblock;

		/*
		 * Handle an odd number of initial blocks so we can
		 * process the rest in eight-block (128-byte) chunks.
		 */
		if (nbytes % 128) {
			unsigned nbytes128 = nbytes % 128;

			nbytes -= nbytes128;
			for (; nbytes128; nbytes128 -= 16, in += 16, out += 16)
			{
				xor128(out, in, t);
				aesvia_encN(enc, out, out, 1, cw0);
				xor128(out, out, t);
				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
			}
		}

		/* Process eight blocks at a time.  */
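		/*
		 * buf[4*i..4*i+3] saves tweak i so that it can be
		 * folded back into the ciphertext after the batched
		 * eight-block encryption below.
		 */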
		for (; nbytes; nbytes -= 128, in += 128, out += 128) {
			unsigned i;
			for (i = 0; i < 8; i++) {
				memcpy(buf + 4*i, t, 16);
				xor128(out + 16*i, in + 16*i, t);
				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
			}
			aesvia_encN(enc, out, out, 8, cw0);
			for (i = 0; i < 8; i++)
				xor128(out + 16*i, out + 16*i, buf + 4*i);
		}

		/* Handle the last block of a page, if necessary.  */
		if (lastblock) {
			xor128(buf, in, t);
			aesvia_encN(enc, (const void *)buf, out, 1, cw0);
			xor128(out, out, t);
			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
		}

		explicit_memset(buf, 0, sizeof buf);
	} else {
		xtsenc_unaligned_evcnt.ev_count++;
		uint8_t buf[16] __aligned(16);

		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
			memcpy(buf, in, 16);
			xor128(buf, buf, t);
			aesvia_encN(enc, buf, buf, 1, cw0);
			xor128(buf, buf, t);
			memcpy(out, buf, 16);
			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
		}

		explicit_memset(buf, 0, sizeof buf);
	}
	fpu_kern_leave();

	memcpy(tweak, t, 16);
	explicit_memset(t, 0, sizeof t);
}

static struct evcnt xtsdec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "xtsdec aligned");
EVCNT_ATTACH_STATIC(xtsdec_aligned_evcnt);
static struct evcnt xtsdec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "xtsdec unaligned");
EVCNT_ATTACH_STATIC(xtsdec_unaligned_evcnt);

static void
aesvia_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint32_t t[4];

	KASSERT(nbytes % 16 == 0);

	memcpy(t, tweak, 16);

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0) {
		xtsdec_aligned_evcnt.ev_count++;
		unsigned lastblock = 0;
		uint32_t buf[8*4] __aligned(16);

		/*
		 * Make sure the last block is not the last block of a
		 * page.  (Note that we store the AES input in `out' as
		 * a temporary buffer, rather than reading it directly
		 * from `in', since we have to combine the tweak
		 * first.)
		 */
		lastblock = 16*(((uintptr_t)(out + nbytes) & 0xfff) == 0);
		nbytes -= lastblock;

		/*
		 * Handle an odd number of initial blocks so we can
		 * process the rest in eight-block (128-byte) chunks.
		 */
		if (nbytes % 128) {
			unsigned nbytes128 = nbytes % 128;

			nbytes -= nbytes128;
			for (; nbytes128; nbytes128 -= 16, in += 16, out += 16)
			{
				xor128(out, in, t);
				aesvia_decN(dec, out, out, 1, cw0);
				xor128(out, out, t);
				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
			}
		}

		/* Process eight blocks at a time.  */
		for (; nbytes; nbytes -= 128, in += 128, out += 128) {
			unsigned i;
			for (i = 0; i < 8; i++) {
				memcpy(buf + 4*i, t, 16);
				xor128(out + 16*i, in + 16*i, t);
				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
			}
			aesvia_decN(dec, out, out, 8, cw0);
			for (i = 0; i < 8; i++)
				xor128(out + 16*i, out + 16*i, buf + 4*i);
		}

		/* Handle the last block of a page, if necessary.  */
		if (lastblock) {
			xor128(buf, in, t);
			aesvia_decN(dec, (const void *)buf, out, 1, cw0);
			xor128(out, out, t);
			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
		}

		explicit_memset(buf, 0, sizeof buf);
	} else {
		xtsdec_unaligned_evcnt.ev_count++;
		uint8_t buf[16] __aligned(16);

		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
			memcpy(buf, in, 16);
			xor128(buf, buf, t);
			aesvia_decN(dec, buf, buf, 1, cw0);
			xor128(buf, buf, t);
			memcpy(out, buf, 16);
			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
		}

		explicit_memset(buf, 0, sizeof buf);
	}
	fpu_kern_leave();

	memcpy(tweak, t, 16);
	explicit_memset(t, 0, sizeof t);
}

static struct evcnt cbcmac_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcmac aligned");
EVCNT_ATTACH_STATIC(cbcmac_aligned_evcnt);
static struct evcnt cbcmac_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcmac unaligned");
EVCNT_ATTACH_STATIC(cbcmac_unaligned_evcnt);

static void
aesvia_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
    size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint8_t authbuf[16] __aligned(16);
	uint8_t *auth = auth0;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	if ((uintptr_t)auth0 & 0xf) {
		memcpy(authbuf, auth0, 16);
		auth = authbuf;
		cbcmac_unaligned_evcnt.ev_count++;
	} else {
		cbcmac_aligned_evcnt.ev_count++;
	}

	fpu_kern_enter();
	aesvia_reload_keys();
	for (; nbytes; nbytes -= 16, in += 16) {
		xor128(auth, auth, in);
		aesvia_encN(enc, auth, auth, 1, cw0);
	}
	fpu_kern_leave();

	if ((uintptr_t)auth0 & 0xf) {
		memcpy(auth0, authbuf, 16);
		explicit_memset(authbuf, 0, sizeof authbuf);
	}
}

static struct evcnt ccmenc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "ccmenc aligned");
EVCNT_ATTACH_STATIC(ccmenc_aligned_evcnt);
static struct evcnt ccmenc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "ccmenc unaligned");
EVCNT_ATTACH_STATIC(ccmenc_unaligned_evcnt);

static void
aesvia_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr0[static 32],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint8_t authctrbuf[32] __aligned(16);
	uint8_t *authctr;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);
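	/*
	 * The 32-byte authctr0 buffer holds the CBC-MAC state in
	 * bytes 0-15 and the CTR block in bytes 16-31, so one
	 * two-block REP XCRYPTECB in the loop below advances both at
	 * once.
	 */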

	if ((uintptr_t)authctr0 & 0xf) {
		memcpy(authctrbuf, authctr0, 16);
		authctr = authctrbuf;
		ccmenc_unaligned_evcnt.ev_count++;
	} else {
		authctr = authctr0;
		ccmenc_aligned_evcnt.ev_count++;
	}
	c0 = le32dec(authctr0 + 16 + 4*0);
	c1 = le32dec(authctr0 + 16 + 4*1);
	c2 = le32dec(authctr0 + 16 + 4*2);
	c3 = be32dec(authctr0 + 16 + 4*3);

	/*
	 * In principle we could use REP XCRYPTCTR here, but that
	 * doesn't help to compute the CBC-MAC step, and certain VIA
	 * CPUs have some weird errata with REP XCRYPTCTR that make it
	 * kind of a pain to use.  So let's just use REP XCRYPTECB to
	 * simultaneously compute the CBC-MAC step and the CTR step.
	 * (Maybe some VIA CPUs will compute REP XCRYPTECB in parallel,
	 * who knows...)
	 */
	fpu_kern_enter();
	aesvia_reload_keys();
	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		xor128(authctr, authctr, in);
		le32enc(authctr + 16 + 4*0, c0);
		le32enc(authctr + 16 + 4*1, c1);
		le32enc(authctr + 16 + 4*2, c2);
		be32enc(authctr + 16 + 4*3, ++c3);
		aesvia_encN(enc, authctr, authctr, 2, cw0);
		xor128(out, in, authctr + 16);
	}
	fpu_kern_leave();

	if ((uintptr_t)authctr0 & 0xf) {
		memcpy(authctr0, authctrbuf, 16);
		explicit_memset(authctrbuf, 0, sizeof authctrbuf);
	}

	le32enc(authctr0 + 16 + 4*0, c0);
	le32enc(authctr0 + 16 + 4*1, c1);
	le32enc(authctr0 + 16 + 4*2, c2);
	be32enc(authctr0 + 16 + 4*3, c3);
}

static struct evcnt ccmdec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "ccmdec aligned");
EVCNT_ATTACH_STATIC(ccmdec_aligned_evcnt);
static struct evcnt ccmdec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "ccmdec unaligned");
EVCNT_ATTACH_STATIC(ccmdec_unaligned_evcnt);

static void
aesvia_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr0[static 32],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint8_t authctrbuf[32] __aligned(16);
	uint8_t *authctr;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	c0 = le32dec(authctr0 + 16 + 4*0);
	c1 = le32dec(authctr0 + 16 + 4*1);
	c2 = le32dec(authctr0 + 16 + 4*2);
	c3 = be32dec(authctr0 + 16 + 4*3);

	if ((uintptr_t)authctr0 & 0xf) {
		memcpy(authctrbuf, authctr0, 16);
		authctr = authctrbuf;
		le32enc(authctr + 16 + 4*0, c0);
		le32enc(authctr + 16 + 4*1, c1);
		le32enc(authctr + 16 + 4*2, c2);
		ccmdec_unaligned_evcnt.ev_count++;
	} else {
		authctr = authctr0;
		ccmdec_aligned_evcnt.ev_count++;
	}

	fpu_kern_enter();
	aesvia_reload_keys();
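	/*
	 * Decryption needs the keystream block before the CBC-MAC can
	 * absorb the plaintext, so generate the first CTR block alone;
	 * each later iteration then advances the MAC and the next CTR
	 * block together in one two-block REP XCRYPTECB, and a final
	 * single-block step finishes the MAC.
	 */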
	be32enc(authctr + 16 + 4*3, ++c3);
	aesvia_encN(enc, authctr + 16, authctr + 16, 1, cw0);
	for (;; in += 16, out += 16) {
		xor128(out, authctr + 16, in);
		xor128(authctr, authctr, out);
		if ((nbytes -= 16) == 0)
			break;
		le32enc(authctr + 16 + 4*0, c0);
		le32enc(authctr + 16 + 4*1, c1);
		le32enc(authctr + 16 + 4*2, c2);
		be32enc(authctr + 16 + 4*3, ++c3);
		aesvia_encN(enc, authctr, authctr, 2, cw0);
	}
	aesvia_encN(enc, authctr, authctr, 1, cw0);
	fpu_kern_leave();

	if ((uintptr_t)authctr0 & 0xf) {
		memcpy(authctr0, authctrbuf, 16);
		explicit_memset(authctrbuf, 0, sizeof authctrbuf);
	}

	le32enc(authctr0 + 16 + 4*0, c0);
	le32enc(authctr0 + 16 + 4*1, c1);
	le32enc(authctr0 + 16 + 4*2, c2);
	be32enc(authctr0 + 16 + 4*3, c3);
}

static int
aesvia_probe(void)
{

	/* Verify that the CPU advertises VIA ACE support.  */
#ifdef _KERNEL
	if ((cpu_feature[4] & CPUID_VIA_HAS_ACE) == 0)
		return -1;
#else
	/*
	 * From the VIA PadLock Programming Guide:
	 * http://linux.via.com.tw/support/beginDownload.action?eleid=181&fid=261
	 */
	unsigned eax, ebx, ecx, edx;
	if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
		return -1;
	if (ebx != signature_CENTAUR_ebx ||
	    ecx != signature_CENTAUR_ecx ||
	    edx != signature_CENTAUR_edx)
		return -1;
	if (eax < 0xc0000000)
		return -1;
	if (!__get_cpuid(0xc0000000, &eax, &ebx, &ecx, &edx))
		return -1;
	if (eax < 0xc0000001)
		return -1;
	if (!__get_cpuid(0xc0000001, &eax, &ebx, &ecx, &edx))
		return -1;
	/*
	 * Check whether ACE and ACE2 are both supported and enabled:
	 * edx bits 6/7 are ACE available/enabled, and bits 8/9 are
	 * ACE2 available/enabled.
	 */
	if ((edx & 0x000000c0) != 0x000000c0 ||
	    (edx & 0x00000300) != 0x00000300)
		return -1;
#endif

	/* Verify that our XTS tweak update logic works.  */
	if (aesvia_xts_update_selftest())
		return -1;

	/* Success!  */
	return 0;
}

struct aes_impl aes_via_impl = {
	.ai_name = "VIA ACE",
	.ai_probe = aesvia_probe,
	.ai_setenckey = aesvia_setenckey,
	.ai_setdeckey = aesvia_setdeckey,
	.ai_enc = aesvia_enc,
	.ai_dec = aesvia_dec,
	.ai_cbc_enc = aesvia_cbc_enc,
	.ai_cbc_dec = aesvia_cbc_dec,
	.ai_xts_enc = aesvia_xts_enc,
	.ai_xts_dec = aesvia_xts_dec,
	.ai_cbcmac_update1 = aesvia_cbcmac_update1,
	.ai_ccm_enc1 = aesvia_ccm_enc1,
	.ai_ccm_dec1 = aesvia_ccm_dec1,
};