/* $NetBSD: hwaes.c,v 1.1 2026/01/09 22:54:30 jmcneill Exp $ */

/*-
 * Copyright (c) 2025 Jared McNeill <jmcneill@invisible.ca>
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * A driver for the Nintendo Wii's AES engine. The driver registers an AES
 * implementation for kernel use via aes_md_init(). AES-128 requests are
 * accelerated by hardware and all other requests are passed through to the
 * default (BearSSL aes_ct) implementation.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: hwaes.c,v 1.1 2026/01/09 22:54:30 jmcneill Exp $");

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/device.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/buf.h>
#include <sys/cpu.h>

#include <machine/wii.h>
#include <machine/wiiu.h>
#include <machine/pio.h>
#include "ahb.h"

#include <crypto/aes/aes.h>
#include <crypto/aes/aes_bear.h>
#include <crypto/aes/aes_impl.h>

/* AES engine registers */
#define AES_CTRL		0x00
#define  AES_CTRL_EXEC		__BIT(31)
#define  AES_CTRL_IRQ		__BIT(30)
#define  AES_CTRL_ERR		__BIT(29)
#define  AES_CTRL_ENA		__BIT(28)
#define  AES_CTRL_DEC		__BIT(27)
#define  AES_CTRL_IV		__BIT(12)
#define  AES_CTRL_BLOCKS	__BITS(11, 0)
#define AES_SRC			0x04
#define AES_DEST		0x08
#define AES_KEY			0x0c
#define AES_IV			0x10

/* Register frame size */
#define AES_REG_SIZE		0x14

/* Device limits */
#define HWAES_BLOCK_LEN		16
#define HWAES_ALIGN		16
#define HWAES_MAX_BLOCKS	4096
#define HWAES_MAX_AES_LEN	(HWAES_BLOCK_LEN * HWAES_MAX_BLOCKS)

static int	hwaes_match(device_t, cfdata_t, void *);
static void	hwaes_attach(device_t, device_t, void *);

struct hwaes_softc;

struct hwaes_dma {
	bus_dmamap_t		dma_map;
	void			*dma_addr;
	size_t			dma_size;
	bus_dma_segment_t	dma_segs[1];
};

struct hwaes_softc {
	device_t		sc_dev;
	bus_space_tag_t		sc_bst;
	bus_space_handle_t	sc_bsh;
	bus_dma_tag_t		sc_dmat;
	struct hwaes_dma	sc_dma_bounce;
};

struct hwaes_softc *hwaes_sc;

#define WR4(sc, reg, val)	\
	bus_space_write_4((sc)->sc_bst, (sc)->sc_bsh, (reg), (val))
#define RD4(sc, reg)		\
	bus_space_read_4((sc)->sc_bst, (sc)->sc_bsh, (reg))

CFATTACH_DECL_NEW(hwaes, sizeof(struct hwaes_softc),
    hwaes_match, hwaes_attach, NULL, NULL);

static int	hwaes_dma_alloc(struct hwaes_softc *, struct hwaes_dma *,
				size_t, int);
static void	hwaes_register(void);

static int
hwaes_match(device_t parent, cfdata_t cf, void *aux)
{
	return 1;
}

static void
hwaes_attach(device_t parent, device_t self, void *aux)
{
	struct ahb_attach_args *aaa = aux;
	struct hwaes_softc *sc = device_private(self);
	bool enabled;
	int error;

	/*
	 * Since aes_md_init() expects per-CPU engines and we only have one,
	 * only enable AES offload in single CPU configurations.
	 */
	enabled = kcpuset_countset(kcpuset_attached) == 1;

	aprint_naive("\n");
	aprint_normal(": AES engine%s\n", enabled ? "" : " (disabled)");
	if (!enabled) {
		return;
	}

	sc->sc_dev = self;
	sc->sc_dmat = aaa->aaa_dmat;
	sc->sc_bst = aaa->aaa_bst;
	error = bus_space_map(sc->sc_bst, aaa->aaa_addr, AES_REG_SIZE,
	    0, &sc->sc_bsh);
	if (error != 0) {
		aprint_error_dev(self, "couldn't map registers (%d)\n", error);
		return;
	}

	ahb_claim_device(self, IOPAESEN);

	error = hwaes_dma_alloc(sc, &sc->sc_dma_bounce, HWAES_MAX_AES_LEN,
	    BUS_DMA_WAITOK);
	if (error != 0) {
		return;
	}

	WR4(sc, AES_CTRL, 0);
	for (;;) {
		if (RD4(sc, AES_CTRL) == 0) {
			break;
		}
	}

	hwaes_sc = sc;
	hwaes_register();
}

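/*
 * Allocate, map, and load a single-segment DMA bounce buffer. All AES
 * requests are staged through this buffer, so it is sized for the largest
 * transfer the engine can do in one operation (HWAES_MAX_AES_LEN bytes).
 */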
static int
hwaes_dma_alloc(struct hwaes_softc *sc, struct hwaes_dma *dma, size_t size,
    int flags)
{
	int error, nsegs;

	dma->dma_size = size;

	error = bus_dmamem_alloc(sc->sc_dmat, dma->dma_size, HWAES_ALIGN, 0,
	    dma->dma_segs, 1, &nsegs, flags);
	if (error != 0) {
		aprint_error_dev(sc->sc_dev,
		    "bus_dmamem_alloc failed: %d\n", error);
		goto alloc_failed;
	}
	error = bus_dmamem_map(sc->sc_dmat, dma->dma_segs, nsegs,
	    dma->dma_size, &dma->dma_addr, flags);
	if (error != 0) {
		aprint_error_dev(sc->sc_dev,
		    "bus_dmamem_map failed: %d\n", error);
		goto map_failed;
	}
	error = bus_dmamap_create(sc->sc_dmat, dma->dma_size, nsegs,
	    dma->dma_size, 0, flags, &dma->dma_map);
	if (error != 0) {
		aprint_error_dev(sc->sc_dev,
		    "bus_dmamap_create failed: %d\n", error);
		goto create_failed;
	}
	error = bus_dmamap_load(sc->sc_dmat, dma->dma_map, dma->dma_addr,
	    dma->dma_size, NULL, flags);
	if (error != 0) {
		aprint_error_dev(sc->sc_dev,
		    "bus_dmamap_load failed: %d\n", error);
		goto load_failed;
	}

	return 0;

load_failed:
	bus_dmamap_destroy(sc->sc_dmat, dma->dma_map);
create_failed:
	bus_dmamem_unmap(sc->sc_dmat, dma->dma_addr, dma->dma_size);
map_failed:
	bus_dmamem_free(sc->sc_dmat, dma->dma_segs, nsegs);
alloc_failed:
	return error;
}

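/*
 * aes_impl probe hook. Always report success (0); hwaes_register() is only
 * called once the engine has attached and the bounce buffer is set up, so
 * there is nothing further to check here.
 */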
static int
hwaes_probe(void)
{
	return 0;
}

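/*
 * For AES-128 only the four raw key words are stashed in the round-key
 * array; they are written to AES_KEY before each operation and the engine
 * does the rest. Other key sizes fall back to the BearSSL key schedule.
 */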
static void
hwaes_setenckey(struct aesenc *enc, const uint8_t *key, uint32_t nrounds)
{
	if (nrounds == AES_128_NROUNDS) {
		enc->aese_aes.aes_rk[0] = be32dec(key + 4*0);
		enc->aese_aes.aes_rk[1] = be32dec(key + 4*1);
		enc->aese_aes.aes_rk[2] = be32dec(key + 4*2);
		enc->aese_aes.aes_rk[3] = be32dec(key + 4*3);
	} else {
		aes_bear_impl.ai_setenckey(enc, key, nrounds);
	}
}

static void
hwaes_setdeckey(struct aesdec *dec, const uint8_t *key, uint32_t nrounds)
{
	if (nrounds == AES_128_NROUNDS) {
		dec->aesd_aes.aes_rk[0] = be32dec(key + 4*0);
		dec->aesd_aes.aes_rk[1] = be32dec(key + 4*1);
		dec->aesd_aes.aes_rk[2] = be32dec(key + 4*2);
		dec->aesd_aes.aes_rk[3] = be32dec(key + 4*3);
	} else {
		aes_bear_impl.ai_setdeckey(dec, key, nrounds);
	}
}

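/*
 * Program a transfer over the bounce buffer and spin until the engine
 * clears AES_CTRL_EXEC or flags an error. Source and destination both
 * point at the bounce buffer, so the data is transformed in place; the
 * caller loads the key/IV registers and handles cache maintenance.
 */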
static void
hwaes_exec_sync(uint32_t flags, uint16_t blocks)
{
	struct hwaes_softc *sc = hwaes_sc;
	uint32_t ctrl;

	KASSERT(blocks > 0);
	KASSERT(blocks <= HWAES_MAX_BLOCKS);

	WR4(sc, AES_SRC, sc->sc_dma_bounce.dma_segs[0].ds_addr);
	WR4(sc, AES_DEST, sc->sc_dma_bounce.dma_segs[0].ds_addr);

	ctrl = AES_CTRL_EXEC | AES_CTRL_ENA | flags;
	ctrl |= __SHIFTIN(blocks - 1, AES_CTRL_BLOCKS);

	WR4(sc, AES_CTRL, ctrl);
	for (;;) {
		ctrl = RD4(sc, AES_CTRL);
		if ((ctrl & AES_CTRL_ERR) != 0) {
			printf("AES error, AES_CTRL = %#x\n", ctrl);
			break;
		}
		if ((ctrl & AES_CTRL_EXEC) == 0) {
			break;
		}
	}
}

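/*
 * Single-block AES-128 ECB encryption. The IV registers are cleared and no
 * chaining is requested, the key is loaded, and the block makes a round
 * trip through the bounce buffer. splvm() keeps interrupt-level users of
 * the shared engine and bounce buffer out while the request is in flight
 * (the driver only enables itself on single-CPU configurations).
 */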
static void
hwaes_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	struct hwaes_softc *sc = hwaes_sc;
	unsigned n;
	int s;

	if (nrounds != AES_128_NROUNDS) {
		aes_bear_impl.ai_enc(enc, in, out, nrounds);
		return;
	}

	s = splvm();

	for (n = 0; n < 4; n++) {
		WR4(sc, AES_IV, 0);
	}
	for (n = 0; n < 4; n++) {
		WR4(sc, AES_KEY, enc->aese_aes.aes_rk[n]);
	}
	memcpy(sc->sc_dma_bounce.dma_addr, in, HWAES_BLOCK_LEN);
	bus_dmamap_sync(sc->sc_dmat, sc->sc_dma_bounce.dma_map,
	    0, HWAES_BLOCK_LEN, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
	hwaes_exec_sync(0, 1);
	bus_dmamap_sync(sc->sc_dmat, sc->sc_dma_bounce.dma_map,
	    0, HWAES_BLOCK_LEN, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
	memcpy(out, sc->sc_dma_bounce.dma_addr, HWAES_BLOCK_LEN);

	splx(s);
}

static void
hwaes_encN(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks)
{
	for (size_t n = 0; n < nblocks; n++) {
		hwaes_enc(enc, &in[n * HWAES_BLOCK_LEN],
		    &out[n * HWAES_BLOCK_LEN], AES_128_NROUNDS);
	}
}

static void
hwaes_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	struct hwaes_softc *sc = hwaes_sc;
	unsigned n;
	int s;

	if (nrounds != AES_128_NROUNDS) {
		aes_bear_impl.ai_dec(dec, in, out, nrounds);
		return;
	}

	s = splvm();

	for (n = 0; n < 4; n++) {
		WR4(sc, AES_IV, 0);
	}
	for (n = 0; n < 4; n++) {
		WR4(sc, AES_KEY, dec->aesd_aes.aes_rk[n]);
	}
	memcpy(sc->sc_dma_bounce.dma_addr, in, HWAES_BLOCK_LEN);
	bus_dmamap_sync(sc->sc_dmat, sc->sc_dma_bounce.dma_map,
	    0, HWAES_BLOCK_LEN, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
	hwaes_exec_sync(AES_CTRL_DEC, 1);
	bus_dmamap_sync(sc->sc_dmat, sc->sc_dma_bounce.dma_map,
	    0, HWAES_BLOCK_LEN, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
	memcpy(out, sc->sc_dma_bounce.dma_addr, HWAES_BLOCK_LEN);

	splx(s);
}

static void
hwaes_decN(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks)
{
	for (size_t n = 0; n < nblocks; n++) {
		hwaes_dec(dec, &in[n * HWAES_BLOCK_LEN],
		    &out[n * HWAES_BLOCK_LEN], AES_128_NROUNDS);
	}
}

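/*
 * AES-128-CBC encryption. The IV and key are loaded once; the first chunk
 * consumes the IV registers, and later chunks set AES_CTRL_IV so the
 * engine keeps chaining from the previous block. Data moves through the
 * bounce buffer in chunks of at most HWAES_MAX_BLOCKS blocks, and the
 * final ciphertext block is copied back to iv for the caller.
 */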
static void
hwaes_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	struct hwaes_softc *sc = hwaes_sc;
	const uint8_t *inp = in;
	uint8_t *outp = out;
	uint32_t flags;
	unsigned n;
	int s;

	if (nrounds != AES_128_NROUNDS) {
		aes_bear_impl.ai_cbc_enc(enc, in, out, nbytes, iv, nrounds);
		return;
	}

	KASSERT(nbytes % HWAES_BLOCK_LEN == 0);
	if (nbytes == 0) {
		return;
	}

	s = splvm();

	for (n = 0; n < 4; n++) {
		WR4(sc, AES_IV, be32dec(&iv[n * 4]));
	}
	for (n = 0; n < 4; n++) {
		WR4(sc, AES_KEY, enc->aese_aes.aes_rk[n]);
	}
	flags = 0;
	while (nbytes > 0) {
		const size_t blocks = MIN(nbytes / HWAES_BLOCK_LEN,
					  HWAES_MAX_BLOCKS);

		memcpy(sc->sc_dma_bounce.dma_addr, inp,
		    blocks * HWAES_BLOCK_LEN);
		bus_dmamap_sync(sc->sc_dmat, sc->sc_dma_bounce.dma_map,
		    0, blocks * HWAES_BLOCK_LEN,
		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
		hwaes_exec_sync(flags, blocks);
		bus_dmamap_sync(sc->sc_dmat, sc->sc_dma_bounce.dma_map,
		    0, blocks * HWAES_BLOCK_LEN,
		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		memcpy(outp, sc->sc_dma_bounce.dma_addr,
		    blocks * HWAES_BLOCK_LEN);

		nbytes -= blocks * HWAES_BLOCK_LEN;
		inp += blocks * HWAES_BLOCK_LEN;
		outp += blocks * HWAES_BLOCK_LEN;
		flags |= AES_CTRL_IV;
	}

	memcpy(iv, outp - HWAES_BLOCK_LEN, HWAES_BLOCK_LEN);

	splx(s);
}

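/*
 * AES-128-CBC decryption, the mirror of hwaes_cbc_enc() above. The next IV
 * (the last ciphertext block of the input) is saved up front so that
 * in-place operation does not clobber it.
 */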
static void
hwaes_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	struct hwaes_softc *sc = hwaes_sc;
	const uint8_t *inp = in;
	uint8_t *outp = out;
	uint32_t flags;
	unsigned n;
	int s;

	if (nrounds != AES_128_NROUNDS) {
		aes_bear_impl.ai_cbc_dec(dec, in, out, nbytes, iv, nrounds);
		return;
	}

	KASSERT(nbytes % HWAES_BLOCK_LEN == 0);
	if (nbytes == 0) {
		return;
	}

	s = splvm();

	for (n = 0; n < 4; n++) {
		WR4(sc, AES_IV, be32dec(&iv[n * 4]));
	}

	memcpy(iv, inp + nbytes - HWAES_BLOCK_LEN, HWAES_BLOCK_LEN);

	for (n = 0; n < 4; n++) {
		WR4(sc, AES_KEY, dec->aesd_aes.aes_rk[n]);
	}
	flags = AES_CTRL_DEC;
	while (nbytes > 0) {
		const size_t blocks = MIN(nbytes / HWAES_BLOCK_LEN,
					  HWAES_MAX_BLOCKS);

		memcpy(sc->sc_dma_bounce.dma_addr, inp,
		    blocks * HWAES_BLOCK_LEN);
		bus_dmamap_sync(sc->sc_dmat, sc->sc_dma_bounce.dma_map,
		    0, blocks * HWAES_BLOCK_LEN,
		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
		hwaes_exec_sync(flags, blocks);
		bus_dmamap_sync(sc->sc_dmat, sc->sc_dma_bounce.dma_map,
		    0, blocks * HWAES_BLOCK_LEN,
		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		memcpy(outp, sc->sc_dma_bounce.dma_addr,
		    blocks * HWAES_BLOCK_LEN);

		nbytes -= blocks * HWAES_BLOCK_LEN;
		inp += blocks * HWAES_BLOCK_LEN;
		outp += blocks * HWAES_BLOCK_LEN;
		flags |= AES_CTRL_IV;
	}

	splx(s);
}

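/*
 * Multiply the 128-bit XTS tweak (kept as four little-endian 32-bit words)
 * by x in GF(2^128): shift left by one bit across the words and fold the
 * carry out of the top word back in with the reduction constant 0x87.
 */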
static void
hwaes_xts_update(uint32_t *t0, uint32_t *t1, uint32_t *t2, uint32_t *t3)
{
	uint32_t s0, s1, s2, s3;

	s0 = *t0 >> 31;
	s1 = *t1 >> 31;
	s2 = *t2 >> 31;
	s3 = *t3 >> 31;
	*t0 = (*t0 << 1) ^ (-s3 & 0x87);
	*t1 = (*t1 << 1) ^ s0;
	*t2 = (*t2 << 1) ^ s1;
	*t3 = (*t3 << 1) ^ s2;
}

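/*
 * AES-128-XTS. The engine is only used for the per-block ECB operation;
 * the tweak is maintained in software, XORed in before and after each
 * block and advanced with hwaes_xts_update().
 */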
static void
hwaes_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint8_t block[16];
	uint8_t tle[16];
	uint32_t t[4];
	const uint8_t *inp = in;
	uint8_t *outp = out;

	if (nrounds != AES_128_NROUNDS) {
		aes_bear_impl.ai_xts_enc(enc, in, out, nbytes, tweak, nrounds);
		return;
	}

	KASSERT(nbytes % 16 == 0);

	t[0] = le32dec(tweak + 4*0);
	t[1] = le32dec(tweak + 4*1);
	t[2] = le32dec(tweak + 4*2);
	t[3] = le32dec(tweak + 4*3);

	while (nbytes > 0) {
		le32enc(tle + 4*0, t[0]);
		le32enc(tle + 4*1, t[1]);
		le32enc(tle + 4*2, t[2]);
		le32enc(tle + 4*3, t[3]);

		for (unsigned n = 0; n < 16; n++) {
			block[n] = inp[n] ^ tle[n];
		}

		hwaes_encN(enc, block, block, 1);

		for (unsigned n = 0; n < 16; n++) {
			outp[n] = block[n] ^ tle[n];
		}

		hwaes_xts_update(&t[0], &t[1], &t[2], &t[3]);

		nbytes -= HWAES_BLOCK_LEN;
		inp += HWAES_BLOCK_LEN;
		outp += HWAES_BLOCK_LEN;
	}

	le32enc(tweak + 4*0, t[0]);
	le32enc(tweak + 4*1, t[1]);
	le32enc(tweak + 4*2, t[2]);
	le32enc(tweak + 4*3, t[3]);

	explicit_memset(t, 0, sizeof(t));
	explicit_memset(block, 0, sizeof(block));
	explicit_memset(tle, 0, sizeof(tle));
}

static void
hwaes_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint8_t block[16];
	uint8_t tle[16];
	uint32_t t[4];
	const uint8_t *inp = in;
	uint8_t *outp = out;

	if (nrounds != AES_128_NROUNDS) {
		aes_bear_impl.ai_xts_dec(dec, in, out, nbytes, tweak, nrounds);
		return;
	}

	KASSERT(nbytes % 16 == 0);

	t[0] = le32dec(tweak + 4*0);
	t[1] = le32dec(tweak + 4*1);
	t[2] = le32dec(tweak + 4*2);
	t[3] = le32dec(tweak + 4*3);

	while (nbytes > 0) {
		le32enc(tle + 4*0, t[0]);
		le32enc(tle + 4*1, t[1]);
		le32enc(tle + 4*2, t[2]);
		le32enc(tle + 4*3, t[3]);

		for (unsigned n = 0; n < 16; n++) {
			block[n] = inp[n] ^ tle[n];
		}

		hwaes_decN(dec, block, block, 1);

		for (unsigned n = 0; n < 16; n++) {
			outp[n] = block[n] ^ tle[n];
		}

		hwaes_xts_update(&t[0], &t[1], &t[2], &t[3]);

		nbytes -= HWAES_BLOCK_LEN;
		inp += HWAES_BLOCK_LEN;
		outp += HWAES_BLOCK_LEN;
	}

	le32enc(tweak + 4*0, t[0]);
	le32enc(tweak + 4*1, t[1]);
	le32enc(tweak + 4*2, t[2]);
	le32enc(tweak + 4*3, t[3]);

	explicit_memset(t, 0, sizeof(t));
	explicit_memset(block, 0, sizeof(block));
	explicit_memset(tle, 0, sizeof(tle));
}

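/*
 * CBC-MAC update: XOR each input block into the running authenticator and
 * re-encrypt it with a single-block ECB operation.
 */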
static void
hwaes_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
    size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds)
{
	const uint8_t *inp = in;

	if (nrounds != AES_128_NROUNDS) {
		aes_bear_impl.ai_cbcmac_update1(enc, in, nbytes, auth0, nrounds);
		return;
	}

	KASSERT(nbytes % 16 == 0);

	while (nbytes > 0) {
		for (unsigned n = 0; n < 16; n++) {
			auth0[n] = auth0[n] ^ inp[n];
		}

		hwaes_encN(enc, auth0, auth0, 1);

		nbytes -= HWAES_BLOCK_LEN;
		inp += HWAES_BLOCK_LEN;
	}
}

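/*
 * AES-128-CCM encryption. authctr0 carries the CBC-MAC state in its first
 * 16 bytes and the CTR block in the following 16. Each iteration XORs the
 * plaintext into the authenticator, bumps the big-endian counter word, and
 * encrypts both halves in a single two-block ECB pass; the encrypted
 * counter block is then used as the keystream for the output.
 */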
static void
hwaes_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr0[static 32],
    uint32_t nrounds)
{
	const uint8_t *inp = in;
	uint8_t *outp = out;
	uint32_t c[4];

	if (nrounds != AES_128_NROUNDS) {
		aes_bear_impl.ai_ccm_enc1(enc, in, out, nbytes, authctr0, nrounds);
		return;
	}

	KASSERT(nbytes % 16 == 0);

	c[0] = le32dec(authctr0 + 16 + 4*0);
	c[1] = le32dec(authctr0 + 16 + 4*1);
	c[2] = le32dec(authctr0 + 16 + 4*2);
	c[3] = be32dec(authctr0 + 16 + 4*3);

	while (nbytes > 0) {
		for (unsigned n = 0; n < 16; n++) {
			authctr0[n] = authctr0[n] ^ inp[n];
		}

		le32enc(authctr0 + 16 + 4*0, c[0]);
		le32enc(authctr0 + 16 + 4*1, c[1]);
		le32enc(authctr0 + 16 + 4*2, c[2]);
		be32enc(authctr0 + 16 + 4*3, ++c[3]);

		hwaes_encN(enc, authctr0, authctr0, 2);

		for (unsigned n = 0; n < 16; n++) {
			outp[n] = inp[n] ^ authctr0[n + 16];
		}

		nbytes -= HWAES_BLOCK_LEN;
		inp += HWAES_BLOCK_LEN;
		outp += HWAES_BLOCK_LEN;
	}

	le32enc(authctr0 + 16 + 4*0, c[0]);
	le32enc(authctr0 + 16 + 4*1, c[1]);
	le32enc(authctr0 + 16 + 4*2, c[2]);
	be32enc(authctr0 + 16 + 4*3, c[3]);
}

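/*
 * AES-128-CCM decryption. The first counter block is encrypted up front to
 * obtain the keystream for the first block of input. Each iteration then
 * recovers the plaintext, folds it into the CBC-MAC state, and (while data
 * remains) encrypts the authenticator together with the next counter block
 * in one two-block ECB pass. A final single-block pass closes out the
 * authenticator.
 */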
static void
hwaes_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr0[static 32],
    uint32_t nrounds)
{
	const uint8_t *inp = in;
	uint8_t *outp = out;
	uint32_t c[4];

	if (nrounds != AES_128_NROUNDS) {
		aes_bear_impl.ai_ccm_dec1(enc, in, out, nbytes, authctr0, nrounds);
		return;
	}

	KASSERT(nbytes % 16 == 0);

	c[0] = le32dec(authctr0 + 16 + 4*0);
	c[1] = le32dec(authctr0 + 16 + 4*1);
	c[2] = le32dec(authctr0 + 16 + 4*2);
	c[3] = be32dec(authctr0 + 16 + 4*3);

	be32enc(authctr0 + 16 + 4*3, ++c[3]);
	hwaes_encN(enc, authctr0 + 16, authctr0 + 16, 1);

	while (nbytes > 0) {
		for (unsigned n = 0; n < 16; n++) {
			outp[n] = authctr0[n + 16] ^ inp[n];
			authctr0[n] = authctr0[n] ^ outp[n];
		}
		nbytes -= HWAES_BLOCK_LEN;
		if (nbytes == 0) {
			break;
		}

		inp += HWAES_BLOCK_LEN;
		outp += HWAES_BLOCK_LEN;

		le32enc(authctr0 + 16 + 4*0, c[0]);
		le32enc(authctr0 + 16 + 4*1, c[1]);
		le32enc(authctr0 + 16 + 4*2, c[2]);
		be32enc(authctr0 + 16 + 4*3, ++c[3]);
		hwaes_encN(enc, authctr0, authctr0, 2);
	}
	hwaes_encN(enc, authctr0, authctr0, 1);

	le32enc(authctr0 + 16 + 4*0, c[0]);
	le32enc(authctr0 + 16 + 4*1, c[1]);
	le32enc(authctr0 + 16 + 4*2, c[2]);
	be32enc(authctr0 + 16 + 4*3, c[3]);
}

static struct aes_impl aes_hwaes_impl = {
	.ai_name = NULL,	/* filled in by hwaes_register */
	.ai_probe = hwaes_probe,
	.ai_setenckey = hwaes_setenckey,
	.ai_setdeckey = hwaes_setdeckey,
	.ai_enc = hwaes_enc,
	.ai_dec = hwaes_dec,
	.ai_cbc_enc = hwaes_cbc_enc,
	.ai_cbc_dec = hwaes_cbc_dec,
	.ai_xts_enc = hwaes_xts_enc,
	.ai_xts_dec = hwaes_xts_dec,
	.ai_cbcmac_update1 = hwaes_cbcmac_update1,
	.ai_ccm_enc1 = hwaes_ccm_enc1,
	.ai_ccm_dec1 = hwaes_ccm_dec1,
};

static void
hwaes_register(void)
{
	if (wiiu_plat) {
		aes_hwaes_impl.ai_name = "Latte AES engine";
	} else {
		aes_hwaes_impl.ai_name = "Hollywood AES engine";
	}
	aes_md_init(&aes_hwaes_impl);
}