/*	$NetBSD: aes_via.c,v 1.9 2024/06/16 16:30:52 rillig Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_via.c,v 1.9 2024/06/16 16:30:52 rillig Exp $");

#ifdef _KERNEL
#include <sys/types.h>
#include <sys/evcnt.h>
#include <sys/systm.h>
#else
#include <assert.h>
#include <err.h>
#include <stdint.h>
#include <string.h>
#define	KASSERT			assert
#define	panic(fmt, args...)	err(1, fmt, ##args)
struct evcnt { uint64_t ev_count; };
#define	EVCNT_INITIALIZER(a,b,c,d) {0}
#define	EVCNT_ATTACH_STATIC(name)	static char name##_attach __unused = 0
#endif

#include <crypto/aes/aes.h>
#include <crypto/aes/aes_bear.h>
#include <crypto/aes/aes_impl.h>

#ifdef _KERNEL
#include <x86/cpufunc.h>
#include <x86/cpuvar.h>
#include <x86/fpu.h>
#include <x86/specialreg.h>
#include <x86/via_padlock.h>
#else
#include <cpuid.h>
#define	fpu_kern_enter()	((void)0)
#define	fpu_kern_leave()	((void)0)
#define C3_CRYPT_CWLO_ROUND_M		0x0000000f
#define C3_CRYPT_CWLO_ALG_M		0x00000070
#define C3_CRYPT_CWLO_ALG_AES		0x00000000
#define C3_CRYPT_CWLO_KEYGEN_M		0x00000080
#define C3_CRYPT_CWLO_KEYGEN_HW		0x00000000
#define C3_CRYPT_CWLO_KEYGEN_SW		0x00000080
#define C3_CRYPT_CWLO_NORMAL		0x00000000
#define C3_CRYPT_CWLO_INTERMEDIATE	0x00000100
#define C3_CRYPT_CWLO_ENCRYPT		0x00000000
#define C3_CRYPT_CWLO_DECRYPT		0x00000200
#define C3_CRYPT_CWLO_KEY128		0x0000000a      /* 128bit, 10 rds */
#define C3_CRYPT_CWLO_KEY192		0x0000040c      /* 192bit, 12 rds */
#define C3_CRYPT_CWLO_KEY256		0x0000080e      /* 256bit, 14 rds */
#endif

static void
aesvia_reload_keys(void)
{

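	/*
	 * The PadLock engine caches the loaded key material between
	 * xcrypt invocations.  Per VIA's PadLock documentation, any
	 * write to EFLAGS -- here an otherwise no-op pushf/popf pair
	 * -- marks that cache stale, so the next xcrypt reloads the
	 * key through the %ebx pointer.
	 */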
	asm volatile("pushf; popf");
}

static uint32_t
aesvia_keylen_cw0(unsigned nrounds)
{

	/*
	 * Determine the control word bits for the key size / number of
	 * rounds.  For AES-128, the hardware can do key expansion on
	 * the fly; for AES-192 and AES-256, software must do it.
	 */
	switch (nrounds) {
	case AES_128_NROUNDS:
		return C3_CRYPT_CWLO_KEY128;
	case AES_192_NROUNDS:
		return C3_CRYPT_CWLO_KEY192 | C3_CRYPT_CWLO_KEYGEN_SW;
	case AES_256_NROUNDS:
		return C3_CRYPT_CWLO_KEY256 | C3_CRYPT_CWLO_KEYGEN_SW;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}
}
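
/*
 * Worked example: for AES-128 this yields 0x0000000a (10 rounds,
 * hardware key expansion); for AES-192, 0x0000048c; and for AES-256,
 * 0x0000088e (12 and 14 rounds with a software-expanded key
 * schedule).  aesvia_encN/aesvia_decN below or in the algorithm,
 * direction, and normal/intermediate-result bits to form the full
 * control word.
 */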

static void
aesvia_setenckey(struct aesenc *enc, const uint8_t *key, uint32_t nrounds)
{
	size_t key_len;

	switch (nrounds) {
	case AES_128_NROUNDS:
		enc->aese_aes.aes_rk[0] = le32dec(key + 4*0);
		enc->aese_aes.aes_rk[1] = le32dec(key + 4*1);
		enc->aese_aes.aes_rk[2] = le32dec(key + 4*2);
		enc->aese_aes.aes_rk[3] = le32dec(key + 4*3);
		return;
	case AES_192_NROUNDS:
		key_len = 24;
		break;
	case AES_256_NROUNDS:
		key_len = 32;
		break;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}
	br_aes_ct_keysched_stdenc(enc->aese_aes.aes_rk, key, key_len);
}

static void
aesvia_setdeckey(struct aesdec *dec, const uint8_t *key, uint32_t nrounds)
{
	size_t key_len;

	switch (nrounds) {
	case AES_128_NROUNDS:
		dec->aesd_aes.aes_rk[0] = le32dec(key + 4*0);
		dec->aesd_aes.aes_rk[1] = le32dec(key + 4*1);
		dec->aesd_aes.aes_rk[2] = le32dec(key + 4*2);
		dec->aesd_aes.aes_rk[3] = le32dec(key + 4*3);
		return;
	case AES_192_NROUNDS:
		key_len = 24;
		break;
	case AES_256_NROUNDS:
		key_len = 32;
		break;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}
	br_aes_ct_keysched_stddec(dec->aesd_aes.aes_rk, key, key_len);
}

static inline void
aesvia_encN(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks, uint32_t cw0)
{
	const uint32_t cw[4] __aligned(16) = {
		[0] = (cw0
		    | C3_CRYPT_CWLO_ALG_AES
		    | C3_CRYPT_CWLO_ENCRYPT
		    | C3_CRYPT_CWLO_NORMAL),
	};

	KASSERT(((uintptr_t)enc & 0xf) == 0);
	KASSERT(((uintptr_t)in & 0xf) == 0);
	KASSERT(((uintptr_t)out & 0xf) == 0);

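	/*
	 * REP XCRYPTECB register convention, per the VIA PadLock
	 * programming guide and the constraints below: %ecx holds the
	 * block count, %esi the source, %edi the destination, %ebx
	 * the key (or key schedule) address, and %edx the control
	 * word address; the hardware requires 16-byte alignment
	 * throughout.
	 */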
	asm volatile("rep xcryptecb"
	    : "+c"(nblocks), "+S"(in), "+D"(out)
	    : "b"(enc), "d"(cw)
	    : "memory", "cc");
}

static inline void
aesvia_decN(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks, uint32_t cw0)
{
	const uint32_t cw[4] __aligned(16) = {
		[0] = (cw0
		    | C3_CRYPT_CWLO_ALG_AES
		    | C3_CRYPT_CWLO_DECRYPT
		    | C3_CRYPT_CWLO_NORMAL),
	};

	KASSERT(((uintptr_t)dec & 0xf) == 0);
	KASSERT(((uintptr_t)in & 0xf) == 0);
	KASSERT(((uintptr_t)out & 0xf) == 0);

	asm volatile("rep xcryptecb"
	    : "+c"(nblocks), "+S"(in), "+D"(out)
	    : "b"(dec), "d"(cw)
	    : "memory", "cc");
}

static struct evcnt enc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "enc aligned");
EVCNT_ATTACH_STATIC(enc_aligned_evcnt);
static struct evcnt enc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "enc unaligned");
EVCNT_ATTACH_STATIC(enc_unaligned_evcnt);

static void
aesvia_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0 &&
	    ((uintptr_t)in & 0xff0) != 0xff0) {
		enc_aligned_evcnt.ev_count++;
		aesvia_encN(enc, in, out, 1, cw0);
	} else {
		enc_unaligned_evcnt.ev_count++;
		/*
		 * VIA requires 16-byte/128-bit alignment, and
		 * xcrypt-ecb reads one block past the one we're
		 * working on -- which may go past the end of the page
		 * into unmapped territory.  Use a bounce buffer if
		 * either constraint is violated.
		 */
		uint8_t inbuf[16] __aligned(16);
		uint8_t outbuf[16] __aligned(16);

		memcpy(inbuf, in, 16);
		aesvia_encN(enc, inbuf, outbuf, 1, cw0);
		memcpy(out, outbuf, 16);

		explicit_memset(inbuf, 0, sizeof inbuf);
		explicit_memset(outbuf, 0, sizeof outbuf);
	}
	fpu_kern_leave();
}
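
/*
 * Illustrative sketch only, not wired into the driver: a single-block
 * smoke test against the FIPS-197 Appendix C.1 AES-128 vector,
 * showing the intended calling sequence through aesvia_setenckey
 * (hardware key-expansion path) and aesvia_enc.  The function name is
 * hypothetical and nothing calls it.
 */
static int __unused
aesvia_fips197_sketch(void)
{
	static const uint8_t key[16] = {
		0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07,
		0x08,0x09,0x0a,0x0b, 0x0c,0x0d,0x0e,0x0f,
	};
	static const uint8_t ptxt[16] = {
		0x00,0x11,0x22,0x33, 0x44,0x55,0x66,0x77,
		0x88,0x99,0xaa,0xbb, 0xcc,0xdd,0xee,0xff,
	};
	static const uint8_t expected[16] = {
		0x69,0xc4,0xe0,0xd8, 0x6a,0x7b,0x04,0x30,
		0xd8,0xcd,0xb7,0x80, 0x70,0xb4,0xc5,0x5a,
	};
	struct aesenc enc __aligned(16);
	uint8_t ctxt[16] __aligned(16);

	aesvia_setenckey(&enc, key, AES_128_NROUNDS);
	aesvia_enc(&enc, ptxt, ctxt, AES_128_NROUNDS);
	return memcmp(ctxt, expected, 16) ? -1 : 0;
}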

static struct evcnt dec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "dec aligned");
EVCNT_ATTACH_STATIC(dec_aligned_evcnt);
static struct evcnt dec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "dec unaligned");
EVCNT_ATTACH_STATIC(dec_unaligned_evcnt);

static void
aesvia_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0 &&
	    ((uintptr_t)in & 0xff0) != 0xff0) {
		dec_aligned_evcnt.ev_count++;
		aesvia_decN(dec, in, out, 1, cw0);
	} else {
		dec_unaligned_evcnt.ev_count++;
		/*
		 * VIA requires 16-byte/128-bit alignment, and
		 * xcrypt-ecb reads one block past the one we're
		 * working on -- which may go past the end of the page
		 * into unmapped territory.  Use a bounce buffer if
		 * either constraint is violated.
		 */
		uint8_t inbuf[16] __aligned(16);
		uint8_t outbuf[16] __aligned(16);

		memcpy(inbuf, in, 16);
		aesvia_decN(dec, inbuf, outbuf, 1, cw0);
		memcpy(out, outbuf, 16);

		explicit_memset(inbuf, 0, sizeof inbuf);
		explicit_memset(outbuf, 0, sizeof outbuf);
	}
	fpu_kern_leave();
}

static inline void
aesvia_cbc_encN(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks, uint8_t **ivp, uint32_t cw0)
{
	const uint32_t cw[4] __aligned(16) = {
		[0] = (cw0
		    | C3_CRYPT_CWLO_ALG_AES
		    | C3_CRYPT_CWLO_ENCRYPT
		    | C3_CRYPT_CWLO_NORMAL),
	};

	KASSERT(((uintptr_t)enc & 0xf) == 0);
	KASSERT(((uintptr_t)in & 0xf) == 0);
	KASSERT(((uintptr_t)out & 0xf) == 0);
	KASSERT(((uintptr_t)*ivp & 0xf) == 0);

	/*
	 * Register effects:
	 * - Counts nblocks down to zero.
	 * - Advances in by nblocks (units of blocks).
	 * - Advances out by nblocks (units of blocks).
	 * - Updates *ivp to point at the last block of out.
	 */
	asm volatile("rep xcryptcbc"
	    : "+c"(nblocks), "+S"(in), "+D"(out), "+a"(*ivp)
	    : "b"(enc), "d"(cw)
	    : "memory", "cc");
}

static inline void
aesvia_cbc_decN(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks, uint8_t iv[static 16],
    uint32_t cw0)
{
	const uint32_t cw[4] __aligned(16) = {
		[0] = (cw0
		    | C3_CRYPT_CWLO_ALG_AES
		    | C3_CRYPT_CWLO_DECRYPT
		    | C3_CRYPT_CWLO_NORMAL),
	};

	KASSERT(((uintptr_t)dec & 0xf) == 0);
	KASSERT(((uintptr_t)in & 0xf) == 0);
	KASSERT(((uintptr_t)out & 0xf) == 0);
	KASSERT(((uintptr_t)iv & 0xf) == 0);

	/*
	 * Register effects:
	 * - Counts nblocks down to zero.
	 * - Advances in by nblocks (units of blocks).
	 * - Advances out by nblocks (units of blocks).
	 * Memory side effects:
	 * - Writes what was the last block of in at the address iv.
	 */
	asm volatile("rep xcryptcbc"
	    : "+c"(nblocks), "+S"(in), "+D"(out)
	    : "a"(iv), "b"(dec), "d"(cw)
	    : "memory", "cc");
}

static inline void
xor128(void *x, const void *a, const void *b)
{
	uint32_t *x32 = x;
	const uint32_t *a32 = a;
	const uint32_t *b32 = b;

	x32[0] = a32[0] ^ b32[0];
	x32[1] = a32[1] ^ b32[1];
	x32[2] = a32[2] ^ b32[2];
	x32[3] = a32[3] ^ b32[3];
}

static struct evcnt cbcenc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcenc aligned");
EVCNT_ATTACH_STATIC(cbcenc_aligned_evcnt);
static struct evcnt cbcenc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcenc unaligned");
EVCNT_ATTACH_STATIC(cbcenc_unaligned_evcnt);

static void
aesvia_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);

	KASSERT(nbytes % 16 == 0);
	if (nbytes == 0)
		return;

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out | (uintptr_t)iv) & 0xf) == 0) {
		cbcenc_aligned_evcnt.ev_count++;
		uint8_t *ivp = iv;
		aesvia_cbc_encN(enc, in, out, nbytes/16, &ivp, cw0);
		memcpy(iv, ivp, 16);
	} else {
		cbcenc_unaligned_evcnt.ev_count++;
		uint8_t cv[16] __aligned(16);
		uint8_t tmp[16] __aligned(16);

		memcpy(cv, iv, 16);
		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
			memcpy(tmp, in, 16);
			xor128(tmp, tmp, cv);
			aesvia_encN(enc, tmp, cv, 1, cw0);
			memcpy(out, cv, 16);
		}
		memcpy(iv, cv, 16);
	}
	fpu_kern_leave();
}

static struct evcnt cbcdec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcdec aligned");
EVCNT_ATTACH_STATIC(cbcdec_aligned_evcnt);
static struct evcnt cbcdec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcdec unaligned");
EVCNT_ATTACH_STATIC(cbcdec_unaligned_evcnt);

static void
aesvia_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);

	KASSERT(nbytes % 16 == 0);
	if (nbytes == 0)
		return;

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out | (uintptr_t)iv) & 0xf) == 0) {
		cbcdec_aligned_evcnt.ev_count++;
		aesvia_cbc_decN(dec, in, out, nbytes/16, iv, cw0);
	} else {
		cbcdec_unaligned_evcnt.ev_count++;
		uint8_t iv0[16] __aligned(16);
		uint8_t cv[16] __aligned(16);
		uint8_t tmp[16] __aligned(16);

		memcpy(iv0, iv, 16);
		memcpy(cv, in + nbytes - 16, 16);
		memcpy(iv, cv, 16);

		for (;;) {
			aesvia_decN(dec, cv, tmp, 1, cw0);
			if ((nbytes -= 16) == 0)
				break;
			memcpy(cv, in + nbytes - 16, 16);
			xor128(tmp, tmp, cv);
			memcpy(out + nbytes, tmp, 16);
		}

		xor128(tmp, tmp, iv0);
		memcpy(out, tmp, 16);
		explicit_memset(tmp, 0, sizeof tmp);
	}
	fpu_kern_leave();
}

static inline void
aesvia_xts_update(uint32_t *t0, uint32_t *t1, uint32_t *t2, uint32_t *t3)
{
	uint32_t s0, s1, s2, s3;

	s0 = *t0 >> 31;
	s1 = *t1 >> 31;
	s2 = *t2 >> 31;
	s3 = *t3 >> 31;
	*t0 = (*t0 << 1) ^ (-s3 & 0x87);
	*t1 = (*t1 << 1) ^ s0;
	*t2 = (*t2 << 1) ^ s1;
	*t3 = (*t3 << 1) ^ s2;
}
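
/*
 * Illustrative sketch only, assuming the tweak is stored
 * little-endian with t0 least significant (as on x86):
 * aesvia_xts_update above is the word-wise form of multiplying the
 * tweak by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.  A
 * hypothetical byte-wise equivalent would read:
 */
static inline void __unused
aesvia_xts_update_bytewise_sketch(uint8_t t[static 16])
{
	unsigned i, carry = 0;

	for (i = 0; i < 16; i++) {
		unsigned msb = t[i] >> 7;

		t[i] = (uint8_t)((t[i] << 1) | carry);
		carry = msb;
	}
	if (carry)		/* reduce by the XTS polynomial */
		t[0] ^= 0x87;
}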

static int
aesvia_xts_update_selftest(void)
{
	static const struct {
		uint32_t in[4], out[4];
	} cases[] = {
		{ {1}, {2} },
		{ {0x80000000U,0,0,0}, {0,1,0,0} },
		{ {0,0x80000000U,0,0}, {0,0,1,0} },
		{ {0,0,0x80000000U,0}, {0,0,0,1} },
		{ {0,0,0,0x80000000U}, {0x87,0,0,0} },
		{ {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
	};
	unsigned i;
	uint32_t t0, t1, t2, t3;

	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
		t0 = cases[i].in[0];
		t1 = cases[i].in[1];
		t2 = cases[i].in[2];
		t3 = cases[i].in[3];
		aesvia_xts_update(&t0, &t1, &t2, &t3);
		if (t0 != cases[i].out[0] ||
		    t1 != cases[i].out[1] ||
		    t2 != cases[i].out[2] ||
		    t3 != cases[i].out[3])
			return -1;
	}

	/* Success!  */
	return 0;
}

static struct evcnt xtsenc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "xtsenc aligned");
EVCNT_ATTACH_STATIC(xtsenc_aligned_evcnt);
static struct evcnt xtsenc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "xtsenc unaligned");
EVCNT_ATTACH_STATIC(xtsenc_unaligned_evcnt);

static void
aesvia_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint32_t t[4];

	KASSERT(nbytes % 16 == 0);

	memcpy(t, tweak, 16);

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0) {
		xtsenc_aligned_evcnt.ev_count++;
		unsigned lastblock = 0;
		uint32_t buf[8*4] __aligned(16);

		/*
		 * Make sure the last block is not the last block of a
		 * page.  (Note that we store the AES input in `out' as
		 * a temporary buffer, rather than reading it directly
		 * from `in', since we have to combine the tweak
		 * first.)
		 */
		lastblock = 16*(((uintptr_t)(out + nbytes) & 0xfff) == 0);
		nbytes -= lastblock;

		/*
		 * Handle an odd number of initial blocks so we can
		 * process the rest in eight-block (128-byte) chunks.
		 */
		if (nbytes % 128) {
			unsigned nbytes128 = nbytes % 128;

			nbytes -= nbytes128;
			for (; nbytes128; nbytes128 -= 16, in += 16, out += 16)
			{
				xor128(out, in, t);
				aesvia_encN(enc, out, out, 1, cw0);
				xor128(out, out, t);
				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
			}
		}

		/* Process eight blocks at a time.  */
		for (; nbytes; nbytes -= 128, in += 128, out += 128) {
			unsigned i;
			for (i = 0; i < 8; i++) {
				memcpy(buf + 4*i, t, 16);
				xor128(out + 16*i, in + 16*i, t);
				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
			}
			aesvia_encN(enc, out, out, 8, cw0);
			for (i = 0; i < 8; i++)
				xor128(out + 16*i, out + 16*i, buf + 4*i);
		}

		/* Handle the last block of a page, if necessary.  */
		if (lastblock) {
			xor128(buf, in, t);
			aesvia_encN(enc, (const void *)buf, out, 1, cw0);
			xor128(out, out, t);
			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
		}

		explicit_memset(buf, 0, sizeof buf);
	} else {
		xtsenc_unaligned_evcnt.ev_count++;
		uint8_t buf[16] __aligned(16);

		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
			memcpy(buf, in, 16);
			xor128(buf, buf, t);
			aesvia_encN(enc, buf, buf, 1, cw0);
			xor128(buf, buf, t);
			memcpy(out, buf, 16);
			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
		}

		explicit_memset(buf, 0, sizeof buf);
	}
	fpu_kern_leave();

	memcpy(tweak, t, 16);
	explicit_memset(t, 0, sizeof t);
}

static struct evcnt xtsdec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "xtsdec aligned");
EVCNT_ATTACH_STATIC(xtsdec_aligned_evcnt);
static struct evcnt xtsdec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "xtsdec unaligned");
EVCNT_ATTACH_STATIC(xtsdec_unaligned_evcnt);

static void
aesvia_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint32_t t[4];

	KASSERT(nbytes % 16 == 0);

	memcpy(t, tweak, 16);

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0) {
		xtsdec_aligned_evcnt.ev_count++;
		unsigned lastblock = 0;
		uint32_t buf[8*4] __aligned(16);

		/*
		 * Make sure the last block is not the last block of a
		 * page.  (Note that we store the AES input in `out' as
		 * a temporary buffer, rather than reading it directly
		 * from `in', since we have to combine the tweak
		 * first.)
		 */
		lastblock = 16*(((uintptr_t)(out + nbytes) & 0xfff) == 0);
		nbytes -= lastblock;

		/*
		 * Handle an odd number of initial blocks so we can
		 * process the rest in eight-block (128-byte) chunks.
		 */
		if (nbytes % 128) {
			unsigned nbytes128 = nbytes % 128;

			nbytes -= nbytes128;
			for (; nbytes128; nbytes128 -= 16, in += 16, out += 16)
			{
				xor128(out, in, t);
				aesvia_decN(dec, out, out, 1, cw0);
				xor128(out, out, t);
				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
			}
		}

		/* Process eight blocks at a time.  */
		for (; nbytes; nbytes -= 128, in += 128, out += 128) {
			unsigned i;
			for (i = 0; i < 8; i++) {
				memcpy(buf + 4*i, t, 16);
				xor128(out + 16*i, in + 16*i, t);
				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
			}
			aesvia_decN(dec, out, out, 8, cw0);
			for (i = 0; i < 8; i++)
				xor128(out + 16*i, out + 16*i, buf + 4*i);
		}

		/* Handle the last block of a page, if necessary.  */
		if (lastblock) {
			xor128(buf, in, t);
			aesvia_decN(dec, (const void *)buf, out, 1, cw0);
			xor128(out, out, t);
			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
		}

		explicit_memset(buf, 0, sizeof buf);
	} else {
		xtsdec_unaligned_evcnt.ev_count++;
		uint8_t buf[16] __aligned(16);

		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
			memcpy(buf, in, 16);
			xor128(buf, buf, t);
			aesvia_decN(dec, buf, buf, 1, cw0);
			xor128(buf, buf, t);
			memcpy(out, buf, 16);
			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
		}

		explicit_memset(buf, 0, sizeof buf);
	}
	fpu_kern_leave();

	memcpy(tweak, t, 16);
	explicit_memset(t, 0, sizeof t);
}

static struct evcnt cbcmac_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcmac aligned");
EVCNT_ATTACH_STATIC(cbcmac_aligned_evcnt);
static struct evcnt cbcmac_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcmac unaligned");
EVCNT_ATTACH_STATIC(cbcmac_unaligned_evcnt);

static void
aesvia_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
    size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint8_t authbuf[16] __aligned(16);
	uint8_t *auth = auth0;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	if ((uintptr_t)auth0 & 0xf) {
		memcpy(authbuf, auth0, 16);
		auth = authbuf;
		cbcmac_unaligned_evcnt.ev_count++;
	} else {
		cbcmac_aligned_evcnt.ev_count++;
	}

	fpu_kern_enter();
	aesvia_reload_keys();
	for (; nbytes; nbytes -= 16, in += 16) {
		xor128(auth, auth, in);
		aesvia_encN(enc, auth, auth, 1, cw0);
	}
	fpu_kern_leave();

	if ((uintptr_t)auth0 & 0xf) {
		memcpy(auth0, authbuf, 16);
		explicit_memset(authbuf, 0, sizeof authbuf);
	}
}

static struct evcnt ccmenc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "ccmenc aligned");
EVCNT_ATTACH_STATIC(ccmenc_aligned_evcnt);
static struct evcnt ccmenc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "ccmenc unaligned");
EVCNT_ATTACH_STATIC(ccmenc_unaligned_evcnt);

static void
aesvia_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr0[static 32],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint8_t authctrbuf[32] __aligned(16);
	uint8_t *authctr;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	if ((uintptr_t)authctr0 & 0xf) {
		memcpy(authctrbuf, authctr0, 16);
		authctr = authctrbuf;
		ccmenc_unaligned_evcnt.ev_count++;
	} else {
		authctr = authctr0;
		ccmenc_aligned_evcnt.ev_count++;
	}
	c0 = le32dec(authctr0 + 16 + 4*0);
	c1 = le32dec(authctr0 + 16 + 4*1);
	c2 = le32dec(authctr0 + 16 + 4*2);
	c3 = be32dec(authctr0 + 16 + 4*3);

	/*
	 * In principle we could use REP XCRYPTCTR here, but that
	 * doesn't help to compute the CBC-MAC step, and certain VIA
	 * CPUs have some weird errata with REP XCRYPTCTR that make it
	 * kind of a pain to use.  So let's just use REP XCRYPTECB to
	 * simultaneously compute the CBC-MAC step and the CTR step.
	 * (Maybe some VIA CPUs will compute REP XCRYPTECB in parallel,
	 * who knows...)
	 */
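	/*
	 * Layout of authctr (32 bytes, 16-byte aligned): bytes 0-15
	 * hold the CBC-MAC accumulator and bytes 16-31 the CTR block,
	 * so each two-block REP XCRYPTECB below advances the MAC and
	 * generates the next keystream block in a single hardware
	 * operation.
	 */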
	fpu_kern_enter();
	aesvia_reload_keys();
	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		xor128(authctr, authctr, in);
		le32enc(authctr + 16 + 4*0, c0);
		le32enc(authctr + 16 + 4*1, c1);
		le32enc(authctr + 16 + 4*2, c2);
		be32enc(authctr + 16 + 4*3, ++c3);
		aesvia_encN(enc, authctr, authctr, 2, cw0);
		xor128(out, in, authctr + 16);
	}
	fpu_kern_leave();

	if ((uintptr_t)authctr0 & 0xf) {
		memcpy(authctr0, authctrbuf, 16);
		explicit_memset(authctrbuf, 0, sizeof authctrbuf);
	}

	le32enc(authctr0 + 16 + 4*0, c0);
	le32enc(authctr0 + 16 + 4*1, c1);
	le32enc(authctr0 + 16 + 4*2, c2);
	be32enc(authctr0 + 16 + 4*3, c3);
}

static struct evcnt ccmdec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "ccmdec aligned");
EVCNT_ATTACH_STATIC(ccmdec_aligned_evcnt);
static struct evcnt ccmdec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "ccmdec unaligned");
EVCNT_ATTACH_STATIC(ccmdec_unaligned_evcnt);

static void
aesvia_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr0[static 32],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint8_t authctrbuf[32] __aligned(16);
	uint8_t *authctr;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	c0 = le32dec(authctr0 + 16 + 4*0);
	c1 = le32dec(authctr0 + 16 + 4*1);
	c2 = le32dec(authctr0 + 16 + 4*2);
	c3 = be32dec(authctr0 + 16 + 4*3);

	if ((uintptr_t)authctr0 & 0xf) {
		memcpy(authctrbuf, authctr0, 16);
		authctr = authctrbuf;
		le32enc(authctr + 16 + 4*0, c0);
		le32enc(authctr + 16 + 4*1, c1);
		le32enc(authctr + 16 + 4*2, c2);
		ccmdec_unaligned_evcnt.ev_count++;
	} else {
		authctr = authctr0;
		ccmdec_aligned_evcnt.ev_count++;
	}

	fpu_kern_enter();
	aesvia_reload_keys();
	be32enc(authctr + 16 + 4*3, ++c3);
	aesvia_encN(enc, authctr + 16, authctr + 16, 1, cw0);
	for (;; in += 16, out += 16) {
		xor128(out, authctr + 16, in);
		xor128(authctr, authctr, out);
		if ((nbytes -= 16) == 0)
			break;
		le32enc(authctr + 16 + 4*0, c0);
		le32enc(authctr + 16 + 4*1, c1);
		le32enc(authctr + 16 + 4*2, c2);
		be32enc(authctr + 16 + 4*3, ++c3);
		aesvia_encN(enc, authctr, authctr, 2, cw0);
	}
	aesvia_encN(enc, authctr, authctr, 1, cw0);
	fpu_kern_leave();

	if ((uintptr_t)authctr0 & 0xf) {
		memcpy(authctr0, authctrbuf, 16);
		explicit_memset(authctrbuf, 0, sizeof authctrbuf);
	}

	le32enc(authctr0 + 16 + 4*0, c0);
	le32enc(authctr0 + 16 + 4*1, c1);
	le32enc(authctr0 + 16 + 4*2, c2);
	be32enc(authctr0 + 16 + 4*3, c3);
}

static int
aesvia_probe(void)
{

	/* Verify that the CPU advertises VIA ACE support.  */
#ifdef _KERNEL
	if ((cpu_feature[4] & CPUID_VIA_HAS_ACE) == 0)
		return -1;
#else
	/*
	 * From the VIA PadLock Programming Guide:
	 * https://web.archive.org/web/20220104214041/http://linux.via.com.tw/support/beginDownload.action?eleid=181&fid=261
	 */
	unsigned eax, ebx, ecx, edx;
	if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
		return -1;
	if (ebx != signature_CENTAUR_ebx ||
	    ecx != signature_CENTAUR_ecx ||
	    edx != signature_CENTAUR_edx)
		return -1;
	if (eax < 0xc0000000)
		return -1;
	if (!__get_cpuid(0xc0000000, &eax, &ebx, &ecx, &edx))
		return -1;
	if (eax < 0xc0000001)
		return -1;
	if (!__get_cpuid(0xc0000001, &eax, &ebx, &ecx, &edx))
		return -1;
	/* Check that ACE and ACE2 are both supported and enabled.  */
	if ((edx & 0x000000c0) != 0x000000c0 ||
	    (edx & 0x00000300) != 0x00000300)
		return -1;
#endif

	/* Verify that our XTS tweak update logic works.  */
	if (aesvia_xts_update_selftest())
		return -1;

	/* Success!  */
	return 0;
}

struct aes_impl aes_via_impl = {
	.ai_name = "VIA ACE",
	.ai_probe = aesvia_probe,
	.ai_setenckey = aesvia_setenckey,
	.ai_setdeckey = aesvia_setdeckey,
	.ai_enc = aesvia_enc,
	.ai_dec = aesvia_dec,
	.ai_cbc_enc = aesvia_cbc_enc,
	.ai_cbc_dec = aesvia_cbc_dec,
	.ai_xts_enc = aesvia_xts_enc,
	.ai_xts_dec = aesvia_xts_dec,
	.ai_cbcmac_update1 = aesvia_cbcmac_update1,
	.ai_ccm_enc1 = aesvia_ccm_enc1,
	.ai_ccm_dec1 = aesvia_ccm_dec1,
};
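
/*
 * Machine-dependent startup code is expected to register this table
 * (on x86, presumably via aes_md_init(&aes_via_impl); see
 * sys/crypto/aes/aes_impl.h), after which the AES dispatch selects it
 * when aesvia_probe() reports success.  The actual call site lives
 * outside this file.
 */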