11.1Sriastrad/* $NetBSD: aes_sse2_4x32_dec.c,v 1.1 2025/11/23 22:48:26 riastradh Exp $ */ 21.1Sriastrad 31.1Sriastrad/* 41.1Sriastrad * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> 51.1Sriastrad * 61.1Sriastrad * Permission is hereby granted, free of charge, to any person obtaining 71.1Sriastrad * a copy of this software and associated documentation files (the 81.1Sriastrad * "Software"), to deal in the Software without restriction, including 91.1Sriastrad * without limitation the rights to use, copy, modify, merge, publish, 101.1Sriastrad * distribute, sublicense, and/or sell copies of the Software, and to 111.1Sriastrad * permit persons to whom the Software is furnished to do so, subject to 121.1Sriastrad * the following conditions: 131.1Sriastrad * 141.1Sriastrad * The above copyright notice and this permission notice shall be 151.1Sriastrad * included in all copies or substantial portions of the Software. 161.1Sriastrad * 171.1Sriastrad * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 181.1Sriastrad * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 191.1Sriastrad * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 201.1Sriastrad * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 211.1Sriastrad * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 221.1Sriastrad * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 231.1Sriastrad * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 241.1Sriastrad * SOFTWARE. 251.1Sriastrad */ 261.1Sriastrad 271.1Sriastrad#include <sys/cdefs.h> 281.1Sriastrad__KERNEL_RCSID(1, "$NetBSD: aes_sse2_4x32_dec.c,v 1.1 2025/11/23 22:48:26 riastradh Exp $"); 291.1Sriastrad 301.1Sriastrad#include <sys/types.h> 311.1Sriastrad 321.1Sriastrad#include "aes_sse2_4x32_impl.h" 331.1Sriastrad 341.1Sriastrad/* see inner.h */ 351.1Sriastradvoid 361.1Sriastradaes_sse2_4x32_bitslice_invSbox(__m128i q[static 8]) 371.1Sriastrad{ 381.1Sriastrad /* 391.1Sriastrad * AES S-box is: 401.1Sriastrad * S(x) = A(I(x)) ^ 0x63 411.1Sriastrad * where I() is inversion in GF(256), and A() is a linear 421.1Sriastrad * transform (0 is formally defined to be its own inverse). 431.1Sriastrad * Since inversion is an involution, the inverse S-box can be 441.1Sriastrad * computed from the S-box as: 451.1Sriastrad * iS(x) = B(S(B(x ^ 0x63)) ^ 0x63) 461.1Sriastrad * where B() is the inverse of A(). Indeed, for any y in GF(256): 471.1Sriastrad * iS(S(y)) = B(A(I(B(A(I(y)) ^ 0x63 ^ 0x63))) ^ 0x63 ^ 0x63) = y 481.1Sriastrad * 491.1Sriastrad * Note: we reuse the implementation of the forward S-box, 501.1Sriastrad * instead of duplicating it here, so that total code size is 511.1Sriastrad * lower. By merging the B() transforms into the S-box circuit 521.1Sriastrad * we could make faster CBC decryption, but CBC decryption is 531.1Sriastrad * already quite faster than CBC encryption because we can 541.1Sriastrad * process two blocks in parallel. 551.1Sriastrad */ 561.1Sriastrad __m128i q0, q1, q2, q3, q4, q5, q6, q7; 571.1Sriastrad 581.1Sriastrad q0 = ~q[0]; 591.1Sriastrad q1 = ~q[1]; 601.1Sriastrad q2 = q[2]; 611.1Sriastrad q3 = q[3]; 621.1Sriastrad q4 = q[4]; 631.1Sriastrad q5 = ~q[5]; 641.1Sriastrad q6 = ~q[6]; 651.1Sriastrad q7 = q[7]; 661.1Sriastrad q[7] = q1 ^ q4 ^ q6; 671.1Sriastrad q[6] = q0 ^ q3 ^ q5; 681.1Sriastrad q[5] = q7 ^ q2 ^ q4; 691.1Sriastrad q[4] = q6 ^ q1 ^ q3; 701.1Sriastrad q[3] = q5 ^ q0 ^ q2; 711.1Sriastrad q[2] = q4 ^ q7 ^ q1; 721.1Sriastrad q[1] = q3 ^ q6 ^ q0; 731.1Sriastrad q[0] = q2 ^ q5 ^ q7; 741.1Sriastrad 751.1Sriastrad aes_sse2_4x32_bitslice_Sbox(q); 761.1Sriastrad 771.1Sriastrad q0 = ~q[0]; 781.1Sriastrad q1 = ~q[1]; 791.1Sriastrad q2 = q[2]; 801.1Sriastrad q3 = q[3]; 811.1Sriastrad q4 = q[4]; 821.1Sriastrad q5 = ~q[5]; 831.1Sriastrad q6 = ~q[6]; 841.1Sriastrad q7 = q[7]; 851.1Sriastrad q[7] = q1 ^ q4 ^ q6; 861.1Sriastrad q[6] = q0 ^ q3 ^ q5; 871.1Sriastrad q[5] = q7 ^ q2 ^ q4; 881.1Sriastrad q[4] = q6 ^ q1 ^ q3; 891.1Sriastrad q[3] = q5 ^ q0 ^ q2; 901.1Sriastrad q[2] = q4 ^ q7 ^ q1; 911.1Sriastrad q[1] = q3 ^ q6 ^ q0; 921.1Sriastrad q[0] = q2 ^ q5 ^ q7; 931.1Sriastrad} 941.1Sriastrad 951.1Sriastradstatic void 961.1Sriastradadd_round_key(__m128i q[static 8], const uint32_t sk[static 8]) 971.1Sriastrad{ 981.1Sriastrad 991.1Sriastrad q[0] ^= _mm_set1_epi32(sk[0]); 1001.1Sriastrad q[1] ^= _mm_set1_epi32(sk[1]); 1011.1Sriastrad q[2] ^= _mm_set1_epi32(sk[2]); 1021.1Sriastrad q[3] ^= _mm_set1_epi32(sk[3]); 1031.1Sriastrad q[4] ^= _mm_set1_epi32(sk[4]); 1041.1Sriastrad q[5] ^= _mm_set1_epi32(sk[5]); 1051.1Sriastrad q[6] ^= _mm_set1_epi32(sk[6]); 1061.1Sriastrad q[7] ^= _mm_set1_epi32(sk[7]); 1071.1Sriastrad} 1081.1Sriastrad 1091.1Sriastradstatic inline __m128i 1101.1Sriastradinv_shift_row(__m128i q) 1111.1Sriastrad{ 1121.1Sriastrad __m128i x, y0, y1, y2, y3, y4, y5, y6; 1131.1Sriastrad 1141.1Sriastrad x = q; 1151.1Sriastrad y0 = x & _mm_set1_epi32(0x000000FF); 1161.1Sriastrad y1 = _mm_slli_epi32(x & _mm_set1_epi32(0x00003F00), 2); 1171.1Sriastrad y2 = _mm_srli_epi32(x & _mm_set1_epi32(0x0000C000), 6); 1181.1Sriastrad y3 = _mm_slli_epi32(x & _mm_set1_epi32(0x000F0000), 4); 1191.1Sriastrad y4 = _mm_srli_epi32(x & _mm_set1_epi32(0x00F00000), 4); 1201.1Sriastrad y5 = _mm_slli_epi32(x & _mm_set1_epi32(0x03000000), 6); 1211.1Sriastrad y6 = _mm_srli_epi32(x & _mm_set1_epi32(0xFC000000), 2); 1221.1Sriastrad return y0 | y1 | y2 | y3 | y4 | y5 | y6; 1231.1Sriastrad} 1241.1Sriastrad 1251.1Sriastradstatic void 1261.1Sriastradinv_shift_rows(__m128i *q) 1271.1Sriastrad{ 1281.1Sriastrad 1291.1Sriastrad q[0] = inv_shift_row(q[0]); 1301.1Sriastrad q[1] = inv_shift_row(q[1]); 1311.1Sriastrad q[2] = inv_shift_row(q[2]); 1321.1Sriastrad q[3] = inv_shift_row(q[3]); 1331.1Sriastrad q[4] = inv_shift_row(q[4]); 1341.1Sriastrad q[5] = inv_shift_row(q[5]); 1351.1Sriastrad q[6] = inv_shift_row(q[6]); 1361.1Sriastrad q[7] = inv_shift_row(q[7]); 1371.1Sriastrad} 1381.1Sriastrad 1391.1Sriastradstatic inline __m128i 1401.1Sriastradrotr16(__m128i x) 1411.1Sriastrad{ 1421.1Sriastrad return _mm_slli_epi32(x, 16) | _mm_srli_epi32(x, 16); 1431.1Sriastrad} 1441.1Sriastrad 1451.1Sriastradstatic void 1461.1Sriastradinv_mix_columns(__m128i q[static 8]) 1471.1Sriastrad{ 1481.1Sriastrad __m128i q0, q1, q2, q3, q4, q5, q6, q7; 1491.1Sriastrad __m128i r0, r1, r2, r3, r4, r5, r6, r7; 1501.1Sriastrad 1511.1Sriastrad q0 = q[0]; 1521.1Sriastrad q1 = q[1]; 1531.1Sriastrad q2 = q[2]; 1541.1Sriastrad q3 = q[3]; 1551.1Sriastrad q4 = q[4]; 1561.1Sriastrad q5 = q[5]; 1571.1Sriastrad q6 = q[6]; 1581.1Sriastrad q7 = q[7]; 1591.1Sriastrad r0 = _mm_srli_epi32(q0, 8) | _mm_slli_epi32(q0, 24); 1601.1Sriastrad r1 = _mm_srli_epi32(q1, 8) | _mm_slli_epi32(q1, 24); 1611.1Sriastrad r2 = _mm_srli_epi32(q2, 8) | _mm_slli_epi32(q2, 24); 1621.1Sriastrad r3 = _mm_srli_epi32(q3, 8) | _mm_slli_epi32(q3, 24); 1631.1Sriastrad r4 = _mm_srli_epi32(q4, 8) | _mm_slli_epi32(q4, 24); 1641.1Sriastrad r5 = _mm_srli_epi32(q5, 8) | _mm_slli_epi32(q5, 24); 1651.1Sriastrad r6 = _mm_srli_epi32(q6, 8) | _mm_slli_epi32(q6, 24); 1661.1Sriastrad r7 = _mm_srli_epi32(q7, 8) | _mm_slli_epi32(q7, 24); 1671.1Sriastrad 1681.1Sriastrad q[0] = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr16(q0 ^ q5 ^ q6 ^ r0 ^ r5); 1691.1Sriastrad q[1] = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr16(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6); 1701.1Sriastrad q[2] = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr16(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7); 1711.1Sriastrad q[3] = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5 ^ rotr16(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7); 1721.1Sriastrad q[4] = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr16(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6); 1731.1Sriastrad q[5] = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr16(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7); 1741.1Sriastrad q[6] = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr16(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7); 1751.1Sriastrad q[7] = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr16(q4 ^ q5 ^ q7 ^ r4 ^ r7); 1761.1Sriastrad} 1771.1Sriastrad 1781.1Sriastrad/* see inner.h */ 1791.1Sriastradvoid 1801.1Sriastradaes_sse2_4x32_bitslice_decrypt(unsigned num_rounds, 1811.1Sriastrad const uint32_t skey[static 120], __m128i q[static 8]) 1821.1Sriastrad{ 1831.1Sriastrad unsigned u; 1841.1Sriastrad 1851.1Sriastrad add_round_key(q, skey + (num_rounds << 3)); 1861.1Sriastrad for (u = num_rounds - 1; u > 0; u --) { 1871.1Sriastrad inv_shift_rows(q); 1881.1Sriastrad aes_sse2_4x32_bitslice_invSbox(q); 1891.1Sriastrad add_round_key(q, skey + (u << 3)); 1901.1Sriastrad inv_mix_columns(q); 1911.1Sriastrad } 1921.1Sriastrad inv_shift_rows(q); 1931.1Sriastrad aes_sse2_4x32_bitslice_invSbox(q); 1941.1Sriastrad add_round_key(q, skey); 1951.1Sriastrad} 196