/* $NetBSD: aes_sse2_4x32_dec.c,v 1.1 2025/11/23 22:48:26 riastradh Exp $ */

/*
 * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_sse2_4x32_dec.c,v 1.1 2025/11/23 22:48:26 riastradh Exp $");

#include <sys/types.h>

#include "aes_sse2_4x32_impl.h"

/*
 * aes_sse2_4x32_bitslice_invSbox(q)
 *
 *	Apply the inverse AES S-box to the bitsliced state q[0..7],
 *	independently in each 32-bit lane, by sandwiching the forward
 *	S-box circuit between two applications of the B() affine
 *	transform (plus the 0x63 constant, which is folded into the
 *	complemented bit planes q0, q1, q5, q6 below).
 *
 * see inner.h
 */
void
aes_sse2_4x32_bitslice_invSbox(__m128i q[static 8])
{
	/*
	 * AES S-box is:
	 *   S(x) = A(I(x)) ^ 0x63
	 * where I() is inversion in GF(256), and A() is a linear
	 * transform (0 is formally defined to be its own inverse).
	 * Since inversion is an involution, the inverse S-box can be
	 * computed from the S-box as:
	 *   iS(x) = B(S(B(x ^ 0x63)) ^ 0x63)
	 * where B() is the inverse of A().  Indeed, for any y in
	 * GF(256):
	 *   iS(S(y)) = B(A(I(B(A(I(y)) ^ 0x63 ^ 0x63))) ^ 0x63 ^ 0x63) = y
	 *
	 * Note: we reuse the implementation of the forward S-box,
	 * instead of duplicating it here, so that total code size is
	 * lower. By merging the B() transforms into the S-box circuit
	 * we could make faster CBC decryption, but CBC decryption is
	 * already quite faster than CBC encryption because we can
	 * process two blocks in parallel.
	 */
	__m128i q0, q1, q2, q3, q4, q5, q6, q7;

	/* B(x ^ 0x63): complement the planes hit by the 0x63 bits... */
	q0 = ~q[0];
	q1 = ~q[1];
	q2 = q[2];
	q3 = q[3];
	q4 = q[4];
	q5 = ~q[5];
	q6 = ~q[6];
	q7 = q[7];
	/* ...then apply the linear transform B() on the bit planes. */
	q[7] = q1 ^ q4 ^ q6;
	q[6] = q0 ^ q3 ^ q5;
	q[5] = q7 ^ q2 ^ q4;
	q[4] = q6 ^ q1 ^ q3;
	q[3] = q5 ^ q0 ^ q2;
	q[2] = q4 ^ q7 ^ q1;
	q[1] = q3 ^ q6 ^ q0;
	q[0] = q2 ^ q5 ^ q7;

	/* Forward S-box circuit, shared with encryption. */
	aes_sse2_4x32_bitslice_Sbox(q);

	/* Second application of B(x ^ 0x63), identical to the above. */
	q0 = ~q[0];
	q1 = ~q[1];
	q2 = q[2];
	q3 = q[3];
	q4 = q[4];
	q5 = ~q[5];
	q6 = ~q[6];
	q7 = q[7];
	q[7] = q1 ^ q4 ^ q6;
	q[6] = q0 ^ q3 ^ q5;
	q[5] = q7 ^ q2 ^ q4;
	q[4] = q6 ^ q1 ^ q3;
	q[3] = q5 ^ q0 ^ q2;
	q[2] = q4 ^ q7 ^ q1;
	q[1] = q3 ^ q6 ^ q0;
	q[0] = q2 ^ q5 ^ q7;
}

/*
 * add_round_key(q, sk)
 *
 *	XOR the eight 32-bit round key words sk[0..7] into the
 *	bitsliced state q[0..7].  Each word is broadcast to all four
 *	32-bit lanes of the corresponding state word, so the same round
 *	key is applied to every lane processed in parallel.
 */
static void
add_round_key(__m128i q[static 8], const uint32_t sk[static 8])
{

	q[0] ^= _mm_set1_epi32(sk[0]);
	q[1] ^= _mm_set1_epi32(sk[1]);
	q[2] ^= _mm_set1_epi32(sk[2]);
	q[3] ^= _mm_set1_epi32(sk[3]);
	q[4] ^= _mm_set1_epi32(sk[4]);
	q[5] ^= _mm_set1_epi32(sk[5]);
	q[6] ^= _mm_set1_epi32(sk[6]);
	q[7] ^= _mm_set1_epi32(sk[7]);
}

/*
 * inv_shift_row(q)
 *
 *	Apply the inverse ShiftRows permutation to one bitsliced state
 *	word, independently in each 32-bit lane.  Each lane is split
 *	into four 8-bit groups (bits 0-7, 8-15, 16-23, 24-31 --
 *	presumably rows 0..3 of the AES state); group 0 is left alone
 *	and groups 1, 2, 3 are rotated in place by 2, 4, and 6 bits
 *	respectively, undoing the forward ShiftRows rotation.
 */
static inline __m128i
inv_shift_row(__m128i q)
{
	__m128i x, y0, y1, y2, y3, y4, y5, y6;

	x = q;
	/* Row 0 (bits 0-7): unchanged. */
	y0 = x & _mm_set1_epi32(0x000000FF);
	/* Row 1 (bits 8-15): rotate the 8-bit group by 2. */
	y1 = _mm_slli_epi32(x & _mm_set1_epi32(0x00003F00), 2);
	y2 = _mm_srli_epi32(x & _mm_set1_epi32(0x0000C000), 6);
	/* Row 2 (bits 16-23): rotate the 8-bit group by 4. */
	y3 = _mm_slli_epi32(x & _mm_set1_epi32(0x000F0000), 4);
	y4 = _mm_srli_epi32(x & _mm_set1_epi32(0x00F00000), 4);
	/* Row 3 (bits 24-31): rotate the 8-bit group by 6. */
	y5 = _mm_slli_epi32(x & _mm_set1_epi32(0x03000000), 6);
	y6 = _mm_srli_epi32(x & _mm_set1_epi32(0xFC000000), 2);
	return y0 | y1 | y2 | y3 | y4 | y5 | y6;
}

/*
 * inv_shift_rows(q)
 *
 *	Apply inv_shift_row to all eight bitsliced state words.
 */
static void
inv_shift_rows(__m128i *q)
{

	q[0] = inv_shift_row(q[0]);
	q[1] = inv_shift_row(q[1]);
	q[2] = inv_shift_row(q[2]);
	q[3] = inv_shift_row(q[3]);
	q[4] = inv_shift_row(q[4]);
	q[5] = inv_shift_row(q[5]);
	q[6] = inv_shift_row(q[6]);
	q[7] = inv_shift_row(q[7]);
}

/*
 * rotr16(x)
 *
 *	Rotate each 32-bit lane of x right by 16 bits.
 */
static inline __m128i
rotr16(__m128i x)
{
	return _mm_slli_epi32(x, 16) | _mm_srli_epi32(x, 16);
}

/*
 * inv_mix_columns(q)
 *
 *	Apply the inverse MixColumns transform to the bitsliced state
 *	q[0..7].  r0..r7 are the state words with each 32-bit lane
 *	rotated right by 8 bits; the XOR networks below are the
 *	per-bit-plane expansion of the InvMixColumns matrix
 *	(multiplication by 0x0e, 0x0b, 0x0d, 0x09 in GF(2^8)), as
 *	derived in BearSSL's constant-time AES implementation.
 */
static void
inv_mix_columns(__m128i q[static 8])
{
	__m128i q0, q1, q2, q3, q4, q5, q6, q7;
	__m128i r0, r1, r2, r3, r4, r5, r6, r7;

	q0 = q[0];
	q1 = q[1];
	q2 = q[2];
	q3 = q[3];
	q4 = q[4];
	q5 = q[5];
	q6 = q[6];
	q7 = q[7];
	/* rN = qN rotated right by 8 bits within each 32-bit lane. */
	r0 = _mm_srli_epi32(q0, 8) | _mm_slli_epi32(q0, 24);
	r1 = _mm_srli_epi32(q1, 8) | _mm_slli_epi32(q1, 24);
	r2 = _mm_srli_epi32(q2, 8) | _mm_slli_epi32(q2, 24);
	r3 = _mm_srli_epi32(q3, 8) | _mm_slli_epi32(q3, 24);
	r4 = _mm_srli_epi32(q4, 8) | _mm_slli_epi32(q4, 24);
	r5 = _mm_srli_epi32(q5, 8) | _mm_slli_epi32(q5, 24);
	r6 = _mm_srli_epi32(q6, 8) | _mm_slli_epi32(q6, 24);
	r7 = _mm_srli_epi32(q7, 8) | _mm_slli_epi32(q7, 24);

	q[0] = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7
	    ^ rotr16(q0 ^ q5 ^ q6 ^ r0 ^ r5);
	q[1] = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7
	    ^ rotr16(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6);
	q[2] = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7
	    ^ rotr16(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7);
	q[3] = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5
	    ^ rotr16(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7);
	q[4] = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7
	    ^ rotr16(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6);
	q[5] = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7
	    ^ rotr16(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7);
	q[6] = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7
	    ^ rotr16(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7);
	q[7] = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7
	    ^ rotr16(q4 ^ q5 ^ q7 ^ r4 ^ r7);
}

/*
 * aes_sse2_4x32_bitslice_decrypt(num_rounds, skey, q)
 *
 *	Decrypt the bitsliced state q[0..8] in place with the expanded
 *	key schedule skey -- eight 32-bit words per round key,
 *	num_rounds + 1 round keys (so at most 15*8 = 120 words).
 *	Standard inverse-cipher order: initial AddRoundKey with the
 *	last round key, then num_rounds - 1 full inverse rounds
 *	(InvShiftRows, InvSubBytes, AddRoundKey, InvMixColumns), then a
 *	final round that omits InvMixColumns.
 *
 * see inner.h
 */
void
aes_sse2_4x32_bitslice_decrypt(unsigned num_rounds,
    const uint32_t skey[static 120], __m128i q[static 8])
{
	unsigned u;

	add_round_key(q, skey + (num_rounds << 3));
	for (u = num_rounds - 1; u > 0; u --) {
		inv_shift_rows(q);
		aes_sse2_4x32_bitslice_invSbox(q);
		add_round_key(q, skey + (u << 3));
		inv_mix_columns(q);
	}
	inv_shift_rows(q);
	aes_sse2_4x32_bitslice_invSbox(q);
	add_round_key(q, skey);
}