      1 /*	$NetBSD: aes_via.c,v 1.11 2026/01/08 11:25:59 nia Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  * POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 #include <sys/cdefs.h>
     30 __KERNEL_RCSID(0, "$NetBSD: aes_via.c,v 1.11 2026/01/08 11:25:59 nia Exp $");
     31 
     32 #ifdef _KERNEL
     33 #include <sys/types.h>
     34 #include <sys/endian.h>
     35 #include <sys/evcnt.h>
     36 #include <sys/systm.h>
     37 #else
     38 #include <assert.h>
     39 #include <err.h>
     40 #include <stdint.h>
     41 #include <string.h>
     42 #include <endian.h>
     43 #define	KASSERT			assert
     44 #define	panic(fmt, args...)	err(1, fmt, args)
     45 struct evcnt { uint64_t ev_count; };
     46 #define	EVCNT_INITIALIZER(a,b,c,d) {0}
     47 #define	EVCNT_ATTACH_STATIC(name)	static char name##_attach __unused = 0
     48 #endif
     49 
     50 #include <crypto/aes/aes.h>
     51 #include <crypto/aes/aes_impl.h>
     52 #include <crypto/aes/aes_keysched.h>
     53 
     54 #ifdef _KERNEL
     55 #include <x86/cpufunc.h>
     56 #include <x86/cpuvar.h>
     57 #include <x86/fpu.h>
     58 #include <x86/specialreg.h>
     59 #include <x86/via_padlock.h>
     60 #else
     61 #include <cpuid.h>
     62 #define	fpu_kern_enter()	((void)0)
     63 #define	fpu_kern_leave()	((void)0)
     64 #define C3_CRYPT_CWLO_ROUND_M		0x0000000f
     65 #define C3_CRYPT_CWLO_ALG_M		0x00000070
     66 #define C3_CRYPT_CWLO_ALG_AES		0x00000000
     67 #define C3_CRYPT_CWLO_KEYGEN_M		0x00000080
     68 #define C3_CRYPT_CWLO_KEYGEN_HW		0x00000000
     69 #define C3_CRYPT_CWLO_KEYGEN_SW		0x00000080
     70 #define C3_CRYPT_CWLO_NORMAL		0x00000000
     71 #define C3_CRYPT_CWLO_INTERMEDIATE	0x00000100
     72 #define C3_CRYPT_CWLO_ENCRYPT		0x00000000
     73 #define C3_CRYPT_CWLO_DECRYPT		0x00000200
     74 #define C3_CRYPT_CWLO_KEY128		0x0000000a      /* 128bit, 10 rds */
     75 #define C3_CRYPT_CWLO_KEY192		0x0000040c      /* 192bit, 12 rds */
     76 #define C3_CRYPT_CWLO_KEY256		0x0000080e      /* 256bit, 14 rds */
     77 #endif
     78 
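        /*
         * The PadLock unit caches the last key material and control word
         * it loaded.  Touching EFLAGS (here with a no-op pushf/popf)
         * marks that cached state stale, so the next xcrypt instruction
         * reloads the key from memory before use.
         */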
     79 static void
     80 aesvia_reload_keys(void)
     81 {
     82 
     83 	asm volatile("pushf; popf");
     84 }
     85 
     86 static uint32_t
     87 aesvia_keylen_cw0(unsigned nrounds)
     88 {
     89 
     90 	/*
     91 	 * Determine the control word bits for the key size / number of
     92 	 * rounds.  For AES-128, the hardware can do key expansion on
     93 	 * the fly; for AES-192 and AES-256, software must do it.
     94 	 */
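        	/*
        	 * The KEY192/KEY256 control words also carry the key-size
        	 * bits (0x400/0x800) alongside the round count in the low
        	 * nibble; OR-ing in KEYGEN_SW tells the hardware that the
        	 * expanded key schedule is supplied by software.
        	 */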
     95 	switch (nrounds) {
     96 	case AES_128_NROUNDS:
     97 		return C3_CRYPT_CWLO_KEY128;
     98 	case AES_192_NROUNDS:
     99 		return C3_CRYPT_CWLO_KEY192 | C3_CRYPT_CWLO_KEYGEN_SW;
    100 	case AES_256_NROUNDS:
    101 		return C3_CRYPT_CWLO_KEY256 | C3_CRYPT_CWLO_KEYGEN_SW;
    102 	default:
    103 		panic("invalid AES nrounds: %u", nrounds);
    104 	}
    105 }
    106 
    107 static void
    108 aesvia_setenckey(struct aesenc *enc, const uint8_t *key, uint32_t nrounds)
    109 {
    110 	size_t key_len;
    111 
    112 	/*
    113 	 * For AES-128, VIA PadLock only needs the original key itself.
    114 	 *
    115 	 * For AES-192 and AES-256, VIA PadLock needs software to
    116 	 * compute the standard AES key schedule.
    117 	 */
    118 	switch (nrounds) {
    119 	case AES_128_NROUNDS:
    120 		enc->aese_aes.aes_rk[0] = le32dec(key + 4*0);
    121 		enc->aese_aes.aes_rk[1] = le32dec(key + 4*1);
    122 		enc->aese_aes.aes_rk[2] = le32dec(key + 4*2);
    123 		enc->aese_aes.aes_rk[3] = le32dec(key + 4*3);
    124 		return;
    125 	case AES_192_NROUNDS:
    126 		key_len = 24;
    127 		break;
    128 	case AES_256_NROUNDS:
    129 		key_len = 32;
    130 		break;
    131 	default:
    132 		panic("invalid AES nrounds: %u", nrounds);
    133 	}
    134 	aes_keysched_enc(enc->aese_aes.aes_rk, key, key_len);
    135 }
    136 
    137 static void
    138 aesvia_setdeckey(struct aesdec *dec, const uint8_t *key, uint32_t nrounds)
    139 {
    140 	size_t key_len;
    141 
    142 	switch (nrounds) {
    143 	case AES_128_NROUNDS:
    144 		dec->aesd_aes.aes_rk[0] = le32dec(key + 4*0);
    145 		dec->aesd_aes.aes_rk[1] = le32dec(key + 4*1);
    146 		dec->aesd_aes.aes_rk[2] = le32dec(key + 4*2);
    147 		dec->aesd_aes.aes_rk[3] = le32dec(key + 4*3);
    148 		return;
    149 	case AES_192_NROUNDS:
    150 		key_len = 24;
    151 		break;
    152 	case AES_256_NROUNDS:
    153 		key_len = 32;
    154 		break;
    155 	default:
    156 		panic("invalid AES nrounds: %u", nrounds);
    157 	}
    158 	aes_keysched_dec(dec->aesd_aes.aes_rk, key, key_len);
    159 }
    160 
    161 static inline void
    162 aesvia_encN(const struct aesenc *enc, const uint8_t in[static 16],
    163     uint8_t out[static 16], size_t nblocks, uint32_t cw0)
    164 {
    165 	const uint32_t cw[4] __aligned(16) = {
    166 		[0] = (cw0
    167 		    | C3_CRYPT_CWLO_ALG_AES
    168 		    | C3_CRYPT_CWLO_ENCRYPT
    169 		    | C3_CRYPT_CWLO_NORMAL),
    170 	};
    171 
    172 	KASSERT(((uintptr_t)enc & 0xf) == 0);
    173 	KASSERT(((uintptr_t)in & 0xf) == 0);
    174 	KASSERT(((uintptr_t)out & 0xf) == 0);
    175 
    176 	asm volatile("rep xcryptecb"
    177 	    : "+c"(nblocks), "+S"(in), "+D"(out)
    178 	    : "b"(enc), "d"(cw)
    179 	    : "memory", "cc");
    180 }
    181 
    182 static inline void
    183 aesvia_decN(const struct aesdec *dec, const uint8_t in[static 16],
    184     uint8_t out[static 16], size_t nblocks, uint32_t cw0)
    185 {
    186 	const uint32_t cw[4] __aligned(16) = {
    187 		[0] = (cw0
    188 		    | C3_CRYPT_CWLO_ALG_AES
    189 		    | C3_CRYPT_CWLO_DECRYPT
    190 		    | C3_CRYPT_CWLO_NORMAL),
    191 	};
    192 
    193 	KASSERT(((uintptr_t)dec & 0xf) == 0);
    194 	KASSERT(((uintptr_t)in & 0xf) == 0);
    195 	KASSERT(((uintptr_t)out & 0xf) == 0);
    196 
    197 	asm volatile("rep xcryptecb"
    198 	    : "+c"(nblocks), "+S"(in), "+D"(out)
    199 	    : "b"(dec), "d"(cw)
    200 	    : "memory", "cc");
    201 }
    202 
    203 static struct evcnt enc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    204     NULL, "aesvia", "enc aligned");
    205 EVCNT_ATTACH_STATIC(enc_aligned_evcnt);
    206 static struct evcnt enc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    207     NULL, "aesvia", "enc unaligned");
    208 EVCNT_ATTACH_STATIC(enc_unaligned_evcnt);
    209 
    210 static void
    211 aesvia_enc(const struct aesenc *enc, const uint8_t in[static 16],
    212     uint8_t out[static 16], uint32_t nrounds)
    213 {
    214 	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
    215 
    216 	fpu_kern_enter();
    217 	aesvia_reload_keys();
    218 	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0 &&
    219 	    ((uintptr_t)in & 0xff0) != 0xff0) {
    220 		enc_aligned_evcnt.ev_count++;
    221 		aesvia_encN(enc, in, out, 1, cw0);
    222 	} else {
    223 		enc_unaligned_evcnt.ev_count++;
    224 		/*
    225 		 * VIA requires 16-byte/128-bit alignment, and
    226 		 * xcrypt-ecb reads one block past the one we're
    227 		 * working on -- which may go past the end of the page
    228 		 * into unmapped territory.  Use a bounce buffer if
    229 		 * either constraint is violated.
    230 		 */
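        		/*
        		 * For example, a block starting at offset 0xff0 of a
        		 * 4 KiB page ends exactly at the page boundary, so the
        		 * one-block read-ahead would touch the next page; that
        		 * is what the 0xff0 test above screens out.
        		 */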
    231 		uint8_t inbuf[16] __aligned(16);
    232 		uint8_t outbuf[16] __aligned(16);
    233 
    234 		memcpy(inbuf, in, 16);
    235 		aesvia_encN(enc, inbuf, outbuf, 1, cw0);
    236 		memcpy(out, outbuf, 16);
    237 
    238 		explicit_memset(inbuf, 0, sizeof inbuf);
    239 		explicit_memset(outbuf, 0, sizeof outbuf);
    240 	}
    241 	fpu_kern_leave();
    242 }
    243 
    244 static struct evcnt dec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    245     NULL, "aesvia", "dec aligned");
    246 EVCNT_ATTACH_STATIC(dec_aligned_evcnt);
    247 static struct evcnt dec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    248     NULL, "aesvia", "dec unaligned");
    249 EVCNT_ATTACH_STATIC(dec_unaligned_evcnt);
    250 
    251 static void
    252 aesvia_dec(const struct aesdec *dec, const uint8_t in[static 16],
    253     uint8_t out[static 16], uint32_t nrounds)
    254 {
    255 	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
    256 
    257 	fpu_kern_enter();
    258 	aesvia_reload_keys();
    259 	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0 &&
    260 	    ((uintptr_t)in & 0xff0) != 0xff0) {
    261 		dec_aligned_evcnt.ev_count++;
    262 		aesvia_decN(dec, in, out, 1, cw0);
    263 	} else {
    264 		dec_unaligned_evcnt.ev_count++;
    265 		/*
    266 		 * VIA requires 16-byte/128-bit alignment, and
    267 		 * xcrypt-ecb reads one block past the one we're
    268 		 * working on -- which may go past the end of the page
    269 		 * into unmapped territory.  Use a bounce buffer if
    270 		 * either constraint is violated.
    271 		 */
    272 		uint8_t inbuf[16] __aligned(16);
    273 		uint8_t outbuf[16] __aligned(16);
    274 
    275 		memcpy(inbuf, in, 16);
    276 		aesvia_decN(dec, inbuf, outbuf, 1, cw0);
    277 		memcpy(out, outbuf, 16);
    278 
    279 		explicit_memset(inbuf, 0, sizeof inbuf);
    280 		explicit_memset(outbuf, 0, sizeof outbuf);
    281 	}
    282 	fpu_kern_leave();
    283 }
    284 
    285 static inline void
    286 aesvia_cbc_encN(const struct aesenc *enc, const uint8_t in[static 16],
    287     uint8_t out[static 16], size_t nblocks, uint8_t **ivp, uint32_t cw0)
    288 {
    289 	const uint32_t cw[4] __aligned(16) = {
    290 		[0] = (cw0
    291 		    | C3_CRYPT_CWLO_ALG_AES
    292 		    | C3_CRYPT_CWLO_ENCRYPT
    293 		    | C3_CRYPT_CWLO_NORMAL),
    294 	};
    295 
    296 	KASSERT(((uintptr_t)enc & 0xf) == 0);
    297 	KASSERT(((uintptr_t)in & 0xf) == 0);
    298 	KASSERT(((uintptr_t)out & 0xf) == 0);
    299 	KASSERT(((uintptr_t)*ivp & 0xf) == 0);
    300 
    301 	/*
    302 	 * Register effects:
    303 	 * - Counts nblocks down to zero.
    304 	 * - Advances in by nblocks (units of blocks).
    305 	 * - Advances out by nblocks (units of blocks).
    306 	 * - Updates *ivp to point at the last block of out.
    307 	 */
    308 	asm volatile("rep xcryptcbc"
    309 	    : "+c"(nblocks), "+S"(in), "+D"(out), "+a"(*ivp)
    310 	    : "b"(enc), "d"(cw)
    311 	    : "memory", "cc");
    312 }
    313 
    314 static inline void
    315 aesvia_cbc_decN(const struct aesdec *dec, const uint8_t in[static 16],
    316     uint8_t out[static 16], size_t nblocks, uint8_t iv[static 16],
    317     uint32_t cw0)
    318 {
    319 	const uint32_t cw[4] __aligned(16) = {
    320 		[0] = (cw0
    321 		    | C3_CRYPT_CWLO_ALG_AES
    322 		    | C3_CRYPT_CWLO_DECRYPT
    323 		    | C3_CRYPT_CWLO_NORMAL),
    324 	};
    325 
    326 	KASSERT(((uintptr_t)dec & 0xf) == 0);
    327 	KASSERT(((uintptr_t)in & 0xf) == 0);
    328 	KASSERT(((uintptr_t)out & 0xf) == 0);
    329 	KASSERT(((uintptr_t)iv & 0xf) == 0);
    330 
    331 	/*
    332 	 * Register effects:
    333 	 * - Counts nblocks down to zero.
    334 	 * - Advances in by nblocks (units of blocks).
    335 	 * - Advances out by nblocks (units of blocks).
    336 	 * Memory side effects:
    337 	 * - Writes what was the last block of in at the address iv.
    338 	 */
    339 	asm volatile("rep xcryptcbc"
    340 	    : "+c"(nblocks), "+S"(in), "+D"(out)
    341 	    : "a"(iv), "b"(dec), "d"(cw)
    342 	    : "memory", "cc");
    343 }
    344 
    345 static inline void
    346 xor128(void *x, const void *a, const void *b)
    347 {
    348 	uint32_t *x32 = x;
    349 	const uint32_t *a32 = a;
    350 	const uint32_t *b32 = b;
    351 
    352 	x32[0] = a32[0] ^ b32[0];
    353 	x32[1] = a32[1] ^ b32[1];
    354 	x32[2] = a32[2] ^ b32[2];
    355 	x32[3] = a32[3] ^ b32[3];
    356 }
    357 
    358 static struct evcnt cbcenc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    359     NULL, "aesvia", "cbcenc aligned");
    360 EVCNT_ATTACH_STATIC(cbcenc_aligned_evcnt);
    361 static struct evcnt cbcenc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    362     NULL, "aesvia", "cbcenc unaligned");
    363 EVCNT_ATTACH_STATIC(cbcenc_unaligned_evcnt);
    364 
    365 static void
    366 aesvia_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
    367     uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    368     uint32_t nrounds)
    369 {
    370 	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
    371 
    372 	KASSERT(nbytes % 16 == 0);
    373 	if (nbytes == 0)
    374 		return;
    375 
    376 	fpu_kern_enter();
    377 	aesvia_reload_keys();
    378 	if ((((uintptr_t)in | (uintptr_t)out | (uintptr_t)iv) & 0xf) == 0) {
    379 		cbcenc_aligned_evcnt.ev_count++;
    380 		uint8_t *ivp = iv;
    381 		aesvia_cbc_encN(enc, in, out, nbytes/16, &ivp, cw0);
    382 		memcpy(iv, ivp, 16);
    383 	} else {
    384 		cbcenc_unaligned_evcnt.ev_count++;
    385 		uint8_t cv[16] __aligned(16);
    386 		uint8_t tmp[16] __aligned(16);
    387 
    388 		memcpy(cv, iv, 16);
    389 		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
    390 			memcpy(tmp, in, 16);
    391 			xor128(tmp, tmp, cv);
    392 			aesvia_encN(enc, tmp, cv, 1, cw0);
    393 			memcpy(out, cv, 16);
    394 		}
    395 		memcpy(iv, cv, 16);
    396 	}
    397 	fpu_kern_leave();
    398 }
    399 
    400 static struct evcnt cbcdec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    401     NULL, "aesvia", "cbcdec aligned");
    402 EVCNT_ATTACH_STATIC(cbcdec_aligned_evcnt);
    403 static struct evcnt cbcdec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    404     NULL, "aesvia", "cbcdec unaligned");
    405 EVCNT_ATTACH_STATIC(cbcdec_unaligned_evcnt);
    406 
    407 static void
    408 aesvia_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
    409     uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    410     uint32_t nrounds)
    411 {
    412 	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
    413 
    414 	KASSERT(nbytes % 16 == 0);
    415 	if (nbytes == 0)
    416 		return;
    417 
    418 	fpu_kern_enter();
    419 	aesvia_reload_keys();
    420 	if ((((uintptr_t)in | (uintptr_t)out | (uintptr_t)iv) & 0xf) == 0) {
    421 		cbcdec_aligned_evcnt.ev_count++;
    422 		aesvia_cbc_decN(dec, in, out, nbytes/16, iv, cw0);
    423 	} else {
    424 		cbcdec_unaligned_evcnt.ev_count++;
    425 		uint8_t iv0[16] __aligned(16);
    426 		uint8_t cv[16] __aligned(16);
    427 		uint8_t tmp[16] __aligned(16);
    428 
    429 		memcpy(iv0, iv, 16);
    430 		memcpy(cv, in + nbytes - 16, 16);
    431 		memcpy(iv, cv, 16);
    432 
    433 		for (;;) {
    434 			aesvia_decN(dec, cv, tmp, 1, cw0);
    435 			if ((nbytes -= 16) == 0)
    436 				break;
    437 			memcpy(cv, in + nbytes - 16, 16);
    438 			xor128(tmp, tmp, cv);
    439 			memcpy(out + nbytes, tmp, 16);
    440 		}
    441 
    442 		xor128(tmp, tmp, iv0);
    443 		memcpy(out, tmp, 16);
    444 		explicit_memset(tmp, 0, sizeof tmp);
    445 	}
    446 	fpu_kern_leave();
    447 }
    448 
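        /*
         * Multiply the 128-bit XTS tweak by x in GF(2^128) modulo
         * x^128 + x^7 + x^2 + x + 1.  The tweak is stored as four
         * little-endian 32-bit words t0..t3, t0 least significant, so
         * this is a 1-bit left shift across the words, with the bit
         * shifted out of t3 folded back into t0 as 0x87.
         */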
    449 static inline void
    450 aesvia_xts_update(uint32_t *t0, uint32_t *t1, uint32_t *t2, uint32_t *t3)
    451 {
    452 	uint32_t s0, s1, s2, s3;
    453 
    454 	s0 = *t0 >> 31;
    455 	s1 = *t1 >> 31;
    456 	s2 = *t2 >> 31;
    457 	s3 = *t3 >> 31;
    458 	*t0 = (*t0 << 1) ^ (-s3 & 0x87);
    459 	*t1 = (*t1 << 1) ^ s0;
    460 	*t2 = (*t2 << 1) ^ s1;
    461 	*t3 = (*t3 << 1) ^ s2;
    462 }
    463 
    464 static int
    465 aesvia_xts_update_selftest(void)
    466 {
    467 	static const struct {
    468 		uint32_t in[4], out[4];
    469 	} cases[] = {
    470 		{ {1}, {2} },
    471 		{ {0x80000000U,0,0,0}, {0,1,0,0} },
    472 		{ {0,0x80000000U,0,0}, {0,0,1,0} },
    473 		{ {0,0,0x80000000U,0}, {0,0,0,1} },
    474 		{ {0,0,0,0x80000000U}, {0x87,0,0,0} },
    475 		{ {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
    476 	};
    477 	unsigned i;
    478 	uint32_t t0, t1, t2, t3;
    479 
    480 	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
    481 		t0 = cases[i].in[0];
    482 		t1 = cases[i].in[1];
    483 		t2 = cases[i].in[2];
    484 		t3 = cases[i].in[3];
    485 		aesvia_xts_update(&t0, &t1, &t2, &t3);
    486 		if (t0 != cases[i].out[0] ||
    487 		    t1 != cases[i].out[1] ||
    488 		    t2 != cases[i].out[2] ||
    489 		    t3 != cases[i].out[3])
    490 			return -1;
    491 	}
    492 
    493 	/* Success!  */
    494 	return 0;
    495 }
    496 
    497 static struct evcnt xtsenc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    498     NULL, "aesvia", "xtsenc aligned");
    499 EVCNT_ATTACH_STATIC(xtsenc_aligned_evcnt);
    500 static struct evcnt xtsenc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    501     NULL, "aesvia", "xtsenc unaligned");
    502 EVCNT_ATTACH_STATIC(xtsenc_unaligned_evcnt);
    503 
    504 static void
    505 aesvia_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
    506     uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    507     uint32_t nrounds)
    508 {
    509 	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
    510 	uint32_t t[4];
    511 
    512 	KASSERT(nbytes % 16 == 0);
    513 
    514 	memcpy(t, tweak, 16);
    515 
    516 	fpu_kern_enter();
    517 	aesvia_reload_keys();
    518 	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0) {
    519 		xtsenc_aligned_evcnt.ev_count++;
    520 		unsigned lastblock = 0;
    521 		uint32_t buf[8*4] __aligned(16);
    522 
    523 		/*
    524 		 * Make sure the last block is not the last block of a
    525 		 * page.  (Note that we store the AES input in `out' as
    526 		 * a temporary buffer, rather than reading it directly
    527 		 * from `in', since we have to combine the tweak
    528 		 * first.)
    529 		 */
    530 		lastblock = 16*(((uintptr_t)(out + nbytes) & 0xfff) == 0);
    531 		nbytes -= lastblock;
    532 
    533 		/*
    534 		 * Handle an odd number of initial blocks so we can
    535 		 * process the rest in eight-block (128-byte) chunks.
    536 		 */
    537 		if (nbytes % 128) {
    538 			unsigned nbytes128 = nbytes % 128;
    539 
    540 			nbytes -= nbytes128;
    541 			for (; nbytes128; nbytes128 -= 16, in += 16, out += 16)
    542 			{
    543 				xor128(out, in, t);
    544 				aesvia_encN(enc, out, out, 1, cw0);
    545 				xor128(out, out, t);
    546 				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
    547 			}
    548 		}
    549 
    550 		/* Process eight blocks at a time, stashing the tweaks in buf.  */
    551 		for (; nbytes; nbytes -= 128, in += 128, out += 128) {
    552 			unsigned i;
    553 			for (i = 0; i < 8; i++) {
    554 				memcpy(buf + 4*i, t, 16);
    555 				xor128(out + 16*i, in + 16*i, t);
    556 				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
    557 			}
    558 			aesvia_encN(enc, out, out, 8, cw0);
    559 			for (i = 0; i < 8; i++)
    560 				xor128(out + 16*i, out + 16*i, buf + 4*i);
    561 		}
    562 
    563 		/* Handle the last block of a page, if necessary.  */
    564 		if (lastblock) {
    565 			xor128(buf, in, t);
    566 			aesvia_encN(enc, (const void *)buf, out, 1, cw0);
        			xor128(out, out, t);
        			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
    567 		}
    568 
    569 		explicit_memset(buf, 0, sizeof buf);
    570 	} else {
    571 		xtsenc_unaligned_evcnt.ev_count++;
    572 		uint8_t buf[16] __aligned(16);
    573 
    574 		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
    575 			memcpy(buf, in, 16);
    576 			xor128(buf, buf, t);
    577 			aesvia_encN(enc, buf, buf, 1, cw0);
    578 			xor128(buf, buf, t);
    579 			memcpy(out, buf, 16);
    580 			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
    581 		}
    582 
    583 		explicit_memset(buf, 0, sizeof buf);
    584 	}
    585 	fpu_kern_leave();
    586 
    587 	memcpy(tweak, t, 16);
    588 	explicit_memset(t, 0, sizeof t);
    589 }
    590 
    591 static struct evcnt xtsdec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    592     NULL, "aesvia", "xtsdec aligned");
    593 EVCNT_ATTACH_STATIC(xtsdec_aligned_evcnt);
    594 static struct evcnt xtsdec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    595     NULL, "aesvia", "xtsdec unaligned");
    596 EVCNT_ATTACH_STATIC(xtsdec_unaligned_evcnt);
    597 
    598 static void
    599 aesvia_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
    600     uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    601     uint32_t nrounds)
    602 {
    603 	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
    604 	uint32_t t[4];
    605 
    606 	KASSERT(nbytes % 16 == 0);
    607 
    608 	memcpy(t, tweak, 16);
    609 
    610 	fpu_kern_enter();
    611 	aesvia_reload_keys();
    612 	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0) {
    613 		xtsdec_aligned_evcnt.ev_count++;
    614 		unsigned lastblock = 0;
    615 		uint32_t buf[8*4] __aligned(16);
    616 
    617 		/*
    618 		 * Make sure the last block is not the last block of a
    619 		 * page.  (Note that we store the AES input in `out' as
    620 		 * a temporary buffer, rather than reading it directly
    621 		 * from `in', since we have to combine the tweak
    622 		 * first.)
    623 		 */
    624 		lastblock = 16*(((uintptr_t)(out + nbytes) & 0xfff) == 0);
    625 		nbytes -= lastblock;
    626 
    627 		/*
    628 		 * Handle an odd number of initial blocks so we can
    629 		 * process the rest in eight-block (128-byte) chunks.
    630 		 */
    631 		if (nbytes % 128) {
    632 			unsigned nbytes128 = nbytes % 128;
    633 
    634 			nbytes -= nbytes128;
    635 			for (; nbytes128; nbytes128 -= 16, in += 16, out += 16)
    636 			{
    637 				xor128(out, in, t);
    638 				aesvia_decN(dec, out, out, 1, cw0);
    639 				xor128(out, out, t);
    640 				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
    641 			}
    642 		}
    643 
    644 		/* Process eight blocks at a time, stashing the tweaks in buf.  */
    645 		for (; nbytes; nbytes -= 128, in += 128, out += 128) {
    646 			unsigned i;
    647 			for (i = 0; i < 8; i++) {
    648 				memcpy(buf + 4*i, t, 16);
    649 				xor128(out + 16*i, in + 16*i, t);
    650 				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
    651 			}
    652 			aesvia_decN(dec, out, out, 8, cw0);
    653 			for (i = 0; i < 8; i++)
    654 				xor128(out + 16*i, out + 16*i, buf + 4*i);
    655 		}
    656 
    657 		/* Handle the last block of a page, if necessary.  */
    658 		if (lastblock) {
    659 			xor128(buf, in, t);
    660 			aesvia_decN(dec, (const void *)buf, out, 1, cw0);
        			xor128(out, out, t);
        			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
    661 		}
    662 
    663 		explicit_memset(buf, 0, sizeof buf);
    664 	} else {
    665 		xtsdec_unaligned_evcnt.ev_count++;
    666 		uint8_t buf[16] __aligned(16);
    667 
    668 		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
    669 			memcpy(buf, in, 16);
    670 			xor128(buf, buf, t);
    671 			aesvia_decN(dec, buf, buf, 1, cw0);
    672 			xor128(buf, buf, t);
    673 			memcpy(out, buf, 16);
    674 			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
    675 		}
    676 
    677 		explicit_memset(buf, 0, sizeof buf);
    678 	}
    679 	fpu_kern_leave();
    680 
    681 	memcpy(tweak, t, 16);
    682 	explicit_memset(t, 0, sizeof t);
    683 }
    684 
    685 static struct evcnt cbcmac_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    686     NULL, "aesvia", "cbcmac aligned");
    687 EVCNT_ATTACH_STATIC(cbcmac_aligned_evcnt);
    688 static struct evcnt cbcmac_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    689     NULL, "aesvia", "cbcmac unaligned");
    690 EVCNT_ATTACH_STATIC(cbcmac_unaligned_evcnt);
    691 
    692 static void
    693 aesvia_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
    694     size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds)
    695 {
    696 	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
    697 	uint8_t authbuf[16] __aligned(16);
    698 	uint8_t *auth = auth0;
    699 
    700 	KASSERT(nbytes);
    701 	KASSERT(nbytes % 16 == 0);
    702 
    703 	if ((uintptr_t)auth0 & 0xf) {
    704 		memcpy(authbuf, auth0, 16);
    705 		auth = authbuf;
    706 		cbcmac_unaligned_evcnt.ev_count++;
    707 	} else {
    708 		cbcmac_aligned_evcnt.ev_count++;
    709 	}
    710 
    711 	fpu_kern_enter();
    712 	aesvia_reload_keys();
    713 	for (; nbytes; nbytes -= 16, in += 16) {
    714 		xor128(auth, auth, in);
    715 		aesvia_encN(enc, auth, auth, 1, cw0);
    716 	}
    717 	fpu_kern_leave();
    718 
    719 	if ((uintptr_t)auth0 & 0xf) {
    720 		memcpy(auth0, authbuf, 16);
    721 		explicit_memset(authbuf, 0, sizeof authbuf);
    722 	}
    723 }
    724 
    725 static struct evcnt ccmenc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    726     NULL, "aesvia", "ccmenc aligned");
    727 EVCNT_ATTACH_STATIC(ccmenc_aligned_evcnt);
    728 static struct evcnt ccmenc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    729     NULL, "aesvia", "ccmenc unaligned");
    730 EVCNT_ATTACH_STATIC(ccmenc_unaligned_evcnt);
    731 
    732 static void
    733 aesvia_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
    734     uint8_t out[static 16], size_t nbytes, uint8_t authctr0[static 32],
    735     uint32_t nrounds)
    736 {
    737 	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
    738 	uint8_t authctrbuf[32] __aligned(16);
    739 	uint8_t *authctr;
    740 	uint32_t c0, c1, c2, c3;
    741 
    742 	KASSERT(nbytes);
    743 	KASSERT(nbytes % 16 == 0);
    744 
    745 	if ((uintptr_t)authctr0 & 0xf) {
    746 		memcpy(authctrbuf, authctr0, 16);
    747 		authctr = authctrbuf;
    748 		ccmenc_unaligned_evcnt.ev_count++;
    749 	} else {
    750 		authctr = authctr0;
    751 		ccmenc_aligned_evcnt.ev_count++;
    752 	}
    753 	c0 = le32dec(authctr0 + 16 + 4*0);
    754 	c1 = le32dec(authctr0 + 16 + 4*1);
    755 	c2 = le32dec(authctr0 + 16 + 4*2);
    756 	c3 = be32dec(authctr0 + 16 + 4*3);
    757 
    758 	/*
    759 	 * In principle we could use REP XCRYPTCTR here, but that
    760 	 * doesn't help to compute the CBC-MAC step, and certain VIA
    761 	 * CPUs have some weird errata with REP XCRYPTCTR that make it
    762 	 * kind of a pain to use.  So let's just use REP XCRYPTECB to
    763 	 * simultaneously compute the CBC-MAC step and the CTR step.
    764 	 * (Maybe some VIA CPUs will compute REP XCRYPTECB in parallel,
    765 	 * who knows...)
    766 	 */
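        	/*
        	 * authctr[0..15] is the running CBC-MAC and authctr[16..31]
        	 * is the current counter block, so one two-block rep
        	 * xcryptecb both advances the MAC and produces the next
        	 * keystream block.
        	 */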
    767 	fpu_kern_enter();
    768 	aesvia_reload_keys();
    769 	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
    770 		xor128(authctr, authctr, in);
    771 		le32enc(authctr + 16 + 4*0, c0);
    772 		le32enc(authctr + 16 + 4*1, c1);
    773 		le32enc(authctr + 16 + 4*2, c2);
    774 		be32enc(authctr + 16 + 4*3, ++c3);
    775 		aesvia_encN(enc, authctr, authctr, 2, cw0);
    776 		xor128(out, in, authctr + 16);
    777 	}
    778 	fpu_kern_leave();
    779 
    780 	if ((uintptr_t)authctr0 & 0xf) {
    781 		memcpy(authctr0, authctrbuf, 16);
    782 		explicit_memset(authctrbuf, 0, sizeof authctrbuf);
    783 	}
    784 
    785 	le32enc(authctr0 + 16 + 4*0, c0);
    786 	le32enc(authctr0 + 16 + 4*1, c1);
    787 	le32enc(authctr0 + 16 + 4*2, c2);
    788 	be32enc(authctr0 + 16 + 4*3, c3);
    789 }
    790 
    791 static struct evcnt ccmdec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    792     NULL, "aesvia", "ccmdec aligned");
    793 EVCNT_ATTACH_STATIC(ccmdec_aligned_evcnt);
    794 static struct evcnt ccmdec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    795     NULL, "aesvia", "ccmdec unaligned");
    796 EVCNT_ATTACH_STATIC(ccmdec_unaligned_evcnt);
    797 
    798 static void
    799 aesvia_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
    800     uint8_t out[static 16], size_t nbytes, uint8_t authctr0[static 32],
    801     uint32_t nrounds)
    802 {
    803 	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
    804 	uint8_t authctrbuf[32] __aligned(16);
    805 	uint8_t *authctr;
    806 	uint32_t c0, c1, c2, c3;
    807 
    808 	KASSERT(nbytes);
    809 	KASSERT(nbytes % 16 == 0);
    810 
    811 	c0 = le32dec(authctr0 + 16 + 4*0);
    812 	c1 = le32dec(authctr0 + 16 + 4*1);
    813 	c2 = le32dec(authctr0 + 16 + 4*2);
    814 	c3 = be32dec(authctr0 + 16 + 4*3);
    815 
    816 	if ((uintptr_t)authctr0 & 0xf) {
    817 		memcpy(authctrbuf, authctr0, 16);
    818 		authctr = authctrbuf;
    819 		le32enc(authctr + 16 + 4*0, c0);
    820 		le32enc(authctr + 16 + 4*1, c1);
    821 		le32enc(authctr + 16 + 4*2, c2);
    822 		ccmdec_unaligned_evcnt.ev_count++;
    823 	} else {
    824 		authctr = authctr0;
    825 		ccmdec_aligned_evcnt.ev_count++;
    826 	}
    827 
    828 	fpu_kern_enter();
    829 	aesvia_reload_keys();
    830 	be32enc(authctr + 16 + 4*3, ++c3);
    831 	aesvia_encN(enc, authctr + 16, authctr + 16, 1, cw0);
    832 	for (;; in += 16, out += 16) {
    833 		xor128(out, authctr + 16, in);
    834 		xor128(authctr, authctr, out);
    835 		if ((nbytes -= 16) == 0)
    836 			break;
    837 		le32enc(authctr + 16 + 4*0, c0);
    838 		le32enc(authctr + 16 + 4*1, c1);
    839 		le32enc(authctr + 16 + 4*2, c2);
    840 		be32enc(authctr + 16 + 4*3, ++c3);
    841 		aesvia_encN(enc, authctr, authctr, 2, cw0);
    842 	}
    843 	aesvia_encN(enc, authctr, authctr, 1, cw0);
    844 	fpu_kern_leave();
    845 
    846 	if ((uintptr_t)authctr0 & 0xf) {
    847 		memcpy(authctr0, authctrbuf, 16);
    848 		explicit_memset(authctrbuf, 0, sizeof authctrbuf);
    849 	}
    850 
    851 	le32enc(authctr0 + 16 + 4*0, c0);
    852 	le32enc(authctr0 + 16 + 4*1, c1);
    853 	le32enc(authctr0 + 16 + 4*2, c2);
    854 	be32enc(authctr0 + 16 + 4*3, c3);
    855 }
    856 
    857 static int
    858 aesvia_probe(void)
    859 {
    860 
    861 	/* Verify that the CPU advertises VIA ACE support.  */
    862 #ifdef _KERNEL
    863 	if ((cpu_feature[4] & CPUID_VIA_HAS_ACE) == 0)
    864 		return -1;
    865 #else
    866 	/*
    867 	 * From the VIA PadLock Programming Guide:
    868 	 * https://web.archive.org/web/20220104214041/http://linux.via.com.tw/support/beginDownload.action?eleid=181&fid=261
    869 	 */
    870 	unsigned eax, ebx, ecx, edx;
    871 	if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
    872 		return -1;
    873 	if (ebx != signature_CENTAUR_ebx ||
    874 	    ecx != signature_CENTAUR_ecx ||
    875 	    edx != signature_CENTAUR_edx)
    876 		return -1;
    877 	if (eax < 0xc0000000)
    878 		return -1;
    879 	if (!__get_cpuid(0xc0000000, &eax, &ebx, &ecx, &edx))
    880 		return -1;
    881 	if (eax < 0xc0000001)
    882 		return -1;
    883 	if (!__get_cpuid(0xc0000001, &eax, &ebx, &ecx, &edx))
    884 		return -1;
    885 	/* Check that ACE and ACE2 are each present and enabled (EDX bits 6-9).  */
    886 	if ((edx & 0x000000c0) != 0x000000c0 ||
    887 	    (edx & 0x00000300) != 0x00000300)
    888 		return -1;
    889 #endif
    890 
    891 	/* Verify that our XTS tweak update logic works.  */
    892 	if (aesvia_xts_update_selftest())
    893 		return -1;
    894 
    895 	/* Success!  */
    896 	return 0;
    897 }
    898 
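        /*
         * Registered by machine-dependent startup code once aesvia_probe
         * succeeds; on NetBSD/x86 this is expected to be hooked up via
         * aes_md_init(&aes_via_impl).
         */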
    899 struct aes_impl aes_via_impl = {
    900 	.ai_name = "VIA ACE",
    901 	.ai_probe = aesvia_probe,
    902 	.ai_setenckey = aesvia_setenckey,
    903 	.ai_setdeckey = aesvia_setdeckey,
    904 	.ai_enc = aesvia_enc,
    905 	.ai_dec = aesvia_dec,
    906 	.ai_cbc_enc = aesvia_cbc_enc,
    907 	.ai_cbc_dec = aesvia_cbc_dec,
    908 	.ai_xts_enc = aesvia_xts_enc,
    909 	.ai_xts_dec = aesvia_xts_dec,
    910 	.ai_cbcmac_update1 = aesvia_cbcmac_update1,
    911 	.ai_ccm_enc1 = aesvia_ccm_enc1,
    912 	.ai_ccm_dec1 = aesvia_ccm_dec1,
    913 };
    914