/*	$NetBSD: aes_sse2_4x32_dec.c,v 1.1 2025/11/23 22:48:26 riastradh Exp $	*/

/*
 * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
261.1Sriastrad
271.1Sriastrad#include <sys/cdefs.h>
281.1Sriastrad__KERNEL_RCSID(1, "$NetBSD: aes_sse2_4x32_dec.c,v 1.1 2025/11/23 22:48:26 riastradh Exp $");
291.1Sriastrad
301.1Sriastrad#include <sys/types.h>
311.1Sriastrad
321.1Sriastrad#include "aes_sse2_4x32_impl.h"
331.1Sriastrad
/*
 * Bitsliced form of x -> B(x ^ 0x63), applied in place to the eight
 * planes q[0..7], where B() is the inverse of the linear map A() in
 * the AES S-box S(x) = A(I(x)) ^ 0x63.  The inverse S-box needs this
 * step both before and after the forward S-box, so it is kept in one
 * place rather than written out twice.
 */
static inline void
invsbox_affine(__m128i q[static 8])
{
	__m128i q0, q1, q2, q3, q4, q5, q6, q7;

	/*
	 * XOR with 0x63 (binary 01100011): complement planes 0, 1, 5
	 * and 6, taking q[i] to carry bit i of each byte.
	 */
	q0 = ~q[0];
	q1 = ~q[1];
	q2 = q[2];
	q3 = q[3];
	q4 = q[4];
	q5 = ~q[5];
	q6 = ~q[6];
	q7 = q[7];

	/*
	 * B(): every output plane is the XOR of three input planes.
	 * All inputs were copied out above, so q[] may be overwritten
	 * here without disturbing later terms.
	 */
	q[7] = q1 ^ q4 ^ q6;
	q[6] = q0 ^ q3 ^ q5;
	q[5] = q7 ^ q2 ^ q4;
	q[4] = q6 ^ q1 ^ q3;
	q[3] = q5 ^ q0 ^ q2;
	q[2] = q4 ^ q7 ^ q1;
	q[1] = q3 ^ q6 ^ q0;
	q[0] = q2 ^ q5 ^ q7;
}

/*
 * Apply the inverse AES S-box to the bitsliced state q[0..7], in place.
 *
 * The AES S-box is
 *   S(x) = A(I(x)) ^ 0x63
 * where I() is inversion in GF(256) (0 is formally defined to be its
 * own inverse) and A() is a linear transform.  Since inversion is an
 * involution, the inverse S-box can be computed from the S-box as
 *   iS(x) = B(S(B(x ^ 0x63)) ^ 0x63)
 * where B() is the inverse of A().  Indeed, for any y in GF(256):
 *   iS(S(y)) = B(A(I(B(A(I(y)) ^ 0x63 ^ 0x63))) ^ 0x63 ^ 0x63) = y
 *
 * The forward S-box implementation is reused rather than duplicated,
 * so that total code size is lower.  Merging the B() transforms into
 * the S-box circuit would make CBC decryption faster, but CBC
 * decryption is already considerably faster than CBC encryption
 * because several blocks can be processed in parallel.
 * NOTE(review): the original wording said "two blocks"; confirm the
 * actual count for the 4x32 layout.
 */
void
aes_sse2_4x32_bitslice_invSbox(__m128i q[static 8])
{

	invsbox_affine(q);		/* B(x ^ 0x63) */
	aes_sse2_4x32_bitslice_Sbox(q);	/* S(...) */
	invsbox_affine(q);		/* B(... ^ 0x63) */
}
941.1Sriastrad
/*
 * XOR one round key into the state, in place.  Word sk[i] is
 * broadcast to all four 32-bit lanes and XORed into plane q[i], for
 * i = 0..7.
 */
static void
add_round_key(__m128i q[static 8], const uint32_t sk[static 8])
{
	unsigned i;

	for (i = 0; i < 8; i++) {
		q[i] ^= _mm_set1_epi32(sk[i]);
	}
}
1081.1Sriastrad
/*
 * Permute the bits of one plane.  In every 32-bit lane each byte is
 * rotated within itself by a different amount:
 *   byte 0 (bits  0..7):  unchanged
 *   byte 1 (bits  8..15): rotated left by 2
 *   byte 2 (bits 16..23): rotated by 4 (nibbles swapped)
 *   byte 3 (bits 24..31): rotated left by 6
 * Returns the permuted plane; the argument is not modified.
 */
static inline __m128i
inv_shift_row(__m128i q)
{
	const __m128i b0 = q & _mm_set1_epi32(0x000000FF);
	const __m128i b1 =
	    _mm_slli_epi32(q & _mm_set1_epi32(0x00003F00), 2) |
	    _mm_srli_epi32(q & _mm_set1_epi32(0x0000C000), 6);
	const __m128i b2 =
	    _mm_slli_epi32(q & _mm_set1_epi32(0x000F0000), 4) |
	    _mm_srli_epi32(q & _mm_set1_epi32(0x00F00000), 4);
	const __m128i b3 =
	    _mm_slli_epi32(q & _mm_set1_epi32(0x03000000), 6) |
	    _mm_srli_epi32(q & _mm_set1_epi32(0xFC000000), 2);

	return b0 | b1 | b2 | b3;
}
1241.1Sriastrad
1251.1Sriastradstatic void
1261.1Sriastradinv_shift_rows(__m128i *q)
1271.1Sriastrad{
1281.1Sriastrad
1291.1Sriastrad	q[0] = inv_shift_row(q[0]);
1301.1Sriastrad	q[1] = inv_shift_row(q[1]);
1311.1Sriastrad	q[2] = inv_shift_row(q[2]);
1321.1Sriastrad	q[3] = inv_shift_row(q[3]);
1331.1Sriastrad	q[4] = inv_shift_row(q[4]);
1341.1Sriastrad	q[5] = inv_shift_row(q[5]);
1351.1Sriastrad	q[6] = inv_shift_row(q[6]);
1361.1Sriastrad	q[7] = inv_shift_row(q[7]);
1371.1Sriastrad}
1381.1Sriastrad
/*
 * Rotate every 32-bit lane of x by 16 bits, i.e. swap the two 16-bit
 * halves of each lane.
 */
static inline __m128i
rotr16(__m128i x)
{
	const __m128i low_to_high = _mm_slli_epi32(x, 16);
	const __m128i high_to_low = _mm_srli_epi32(x, 16);

	return low_to_high | high_to_low;
}
1441.1Sriastrad
/*
 * Bitsliced InvMixColumns on the eight planes q[0..7], in place.
 *
 * r_i is q_i with every 32-bit lane rotated right by 8 bits.  Each
 * output plane is the XOR of a set of q_i, a set of r_i, and the
 * 16-bit rotation of another set of q_i and r_i.  These four groups
 * are the plane-by-plane expansion of multiplying by the InvMixColumns
 * coefficients 0E (q terms), 0B (r terms), 0D (rotated q terms) and
 * 09 (rotated r terms).
 */
static void
inv_mix_columns(__m128i q[static 8])
{
	/* Snapshot the inputs: q[] is overwritten below. */
	const __m128i q0 = q[0];
	const __m128i q1 = q[1];
	const __m128i q2 = q[2];
	const __m128i q3 = q[3];
	const __m128i q4 = q[4];
	const __m128i q5 = q[5];
	const __m128i q6 = q[6];
	const __m128i q7 = q[7];

	/* Each plane rotated right by 8 bits within every 32-bit lane. */
	const __m128i r0 = _mm_srli_epi32(q0, 8) | _mm_slli_epi32(q0, 24);
	const __m128i r1 = _mm_srli_epi32(q1, 8) | _mm_slli_epi32(q1, 24);
	const __m128i r2 = _mm_srli_epi32(q2, 8) | _mm_slli_epi32(q2, 24);
	const __m128i r3 = _mm_srli_epi32(q3, 8) | _mm_slli_epi32(q3, 24);
	const __m128i r4 = _mm_srli_epi32(q4, 8) | _mm_slli_epi32(q4, 24);
	const __m128i r5 = _mm_srli_epi32(q5, 8) | _mm_slli_epi32(q5, 24);
	const __m128i r6 = _mm_srli_epi32(q6, 8) | _mm_slli_epi32(q6, 24);
	const __m128i r7 = _mm_srli_epi32(q7, 8) | _mm_slli_epi32(q7, 24);

	q[0] = (q5 ^ q6 ^ q7) ^ (r0 ^ r5 ^ r7) ^
	    rotr16((q0 ^ q5 ^ q6) ^ (r0 ^ r5));
	q[1] = (q0 ^ q5) ^ (r0 ^ r1 ^ r5 ^ r6 ^ r7) ^
	    rotr16((q1 ^ q5 ^ q7) ^ (r1 ^ r5 ^ r6));
	q[2] = (q0 ^ q1 ^ q6) ^ (r1 ^ r2 ^ r6 ^ r7) ^
	    rotr16((q0 ^ q2 ^ q6) ^ (r2 ^ r6 ^ r7));
	q[3] = (q0 ^ q1 ^ q2 ^ q5 ^ q6) ^ (r0 ^ r2 ^ r3 ^ r5) ^
	    rotr16((q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7) ^ (r0 ^ r3 ^ r5 ^ r7));
	q[4] = (q1 ^ q2 ^ q3 ^ q5) ^ (r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7) ^
	    rotr16((q1 ^ q2 ^ q4 ^ q5 ^ q7) ^ (r1 ^ r4 ^ r5 ^ r6));
	q[5] = (q2 ^ q3 ^ q4 ^ q6) ^ (r2 ^ r4 ^ r5 ^ r6 ^ r7) ^
	    rotr16((q2 ^ q3 ^ q5 ^ q6) ^ (r2 ^ r5 ^ r6 ^ r7));
	q[6] = (q3 ^ q4 ^ q5 ^ q7) ^ (r3 ^ r5 ^ r6 ^ r7) ^
	    rotr16((q3 ^ q4 ^ q6 ^ q7) ^ (r3 ^ r6 ^ r7));
	q[7] = (q4 ^ q5 ^ q6) ^ (r4 ^ r6 ^ r7) ^
	    rotr16((q4 ^ q5 ^ q7) ^ (r4 ^ r7));
}
1771.1Sriastrad
/*
 * Decrypt the bitsliced state q[0..7] in place.
 *
 * skey holds eight 32-bit words per round key, with the key for round
 * r starting at skey[8 * r].  Keys are consumed from round num_rounds
 * down to round 0: the key for round num_rounds is added first, then
 * each of the rounds num_rounds - 1 .. 1 applies inv_shift_rows, the
 * inverse S-box, its round key and inv_mix_columns, and the last step
 * does the same without inv_mix_columns using the round 0 key.
 *
 * num_rounds must be at least 1 (with 0 the unsigned round counter
 * wraps around) and at most 14 for the accesses to stay within the
 * 120 words of skey.
 */
void
aes_sse2_4x32_bitslice_decrypt(unsigned num_rounds,
	const uint32_t skey[static 120], __m128i q[static 8])
{
	unsigned round = num_rounds;

	add_round_key(q, skey + (round << 3));

	while (--round > 0) {
		inv_shift_rows(q);
		aes_sse2_4x32_bitslice_invSbox(q);
		add_round_key(q, skey + (round << 3));
		inv_mix_columns(q);
	}

	/* Final step: no inv_mix_columns, and the round 0 key. */
	inv_shift_rows(q);
	aes_sse2_4x32_bitslice_invSbox(q);
	add_round_key(q, skey);
}
196