11.1Sriastrad/*	$NetBSD: aes_sse2_4x32_enc.c,v 1.1 2025/11/23 22:48:26 riastradh Exp $	*/
21.1Sriastrad
31.1Sriastrad/*
41.1Sriastrad * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
51.1Sriastrad *
61.1Sriastrad * Permission is hereby granted, free of charge, to any person obtaining
71.1Sriastrad * a copy of this software and associated documentation files (the
81.1Sriastrad * "Software"), to deal in the Software without restriction, including
91.1Sriastrad * without limitation the rights to use, copy, modify, merge, publish,
101.1Sriastrad * distribute, sublicense, and/or sell copies of the Software, and to
111.1Sriastrad * permit persons to whom the Software is furnished to do so, subject to
121.1Sriastrad * the following conditions:
131.1Sriastrad *
141.1Sriastrad * The above copyright notice and this permission notice shall be
151.1Sriastrad * included in all copies or substantial portions of the Software.
161.1Sriastrad *
171.1Sriastrad * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
181.1Sriastrad * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
191.1Sriastrad * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
201.1Sriastrad * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
211.1Sriastrad * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
221.1Sriastrad * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
231.1Sriastrad * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
241.1Sriastrad * SOFTWARE.
251.1Sriastrad */
261.1Sriastrad
271.1Sriastrad#include <sys/cdefs.h>
281.1Sriastrad__KERNEL_RCSID(1, "$NetBSD: aes_sse2_4x32_enc.c,v 1.1 2025/11/23 22:48:26 riastradh Exp $");
291.1Sriastrad
301.1Sriastrad#include <sys/types.h>
311.1Sriastrad
321.1Sriastrad#include "aes_sse2_4x32_impl.h"
331.1Sriastrad
/*
 * add_round_key(q, sk)
 *
 *	XOR one round key into the state, in place.  For each of the
 *	eight vectors q[i], the 32-bit word sk[i] is replicated into
 *	all four 32-bit lanes and XORed into q[i].
 */
static inline void
add_round_key(__m128i q[static 8], const uint32_t sk[static 8])
{
	unsigned i;

	for (i = 0; i < 8; i++)
		q[i] ^= _mm_set1_epi32(sk[i]);
}
471.1Sriastrad
/*
 * shift_row(q)
 *
 *	Permute the bits of every 32-bit lane of q, one byte at a time:
 *
 *	  bits  0..7	left as they are
 *	  bits  8..15	rotated right by 2 within the byte
 *	  bits 16..23	rotated by 4 within the byte (nibbles swapped)
 *	  bits 24..31	rotated right by 6 (= left by 2) within the byte
 *
 *	Returns the permuted vector; q itself is passed by value.
 */
static inline __m128i
shift_row(__m128i q)
{
	__m128i byte0, byte1, byte2, byte3;

	byte0 = q & _mm_set1_epi32(0x000000FF);

	byte1 = _mm_srli_epi32(q & _mm_set1_epi32(0x0000FC00), 2) |
	    _mm_slli_epi32(q & _mm_set1_epi32(0x00000300), 6);

	byte2 = _mm_srli_epi32(q & _mm_set1_epi32(0x00F00000), 4) |
	    _mm_slli_epi32(q & _mm_set1_epi32(0x000F0000), 4);

	byte3 = _mm_srli_epi32(q & _mm_set1_epi32(0xC0000000), 6) |
	    _mm_slli_epi32(q & _mm_set1_epi32(0x3F000000), 2);

	return byte0 | byte1 | byte2 | byte3;
}
631.1Sriastrad
641.1Sriastradstatic inline void
651.1Sriastradshift_rows(__m128i q[static 8])
661.1Sriastrad{
671.1Sriastrad
681.1Sriastrad	q[0] = shift_row(q[0]);
691.1Sriastrad	q[1] = shift_row(q[1]);
701.1Sriastrad	q[2] = shift_row(q[2]);
711.1Sriastrad	q[3] = shift_row(q[3]);
721.1Sriastrad	q[4] = shift_row(q[4]);
731.1Sriastrad	q[5] = shift_row(q[5]);
741.1Sriastrad	q[6] = shift_row(q[6]);
751.1Sriastrad	q[7] = shift_row(q[7]);
761.1Sriastrad}
771.1Sriastrad
/*
 * rotr16(x)
 *
 *	Rotate every 32-bit lane of x by 16 bits, i.e. swap the two
 *	16-bit halves of each lane.
 */
static inline __m128i
rotr16(__m128i x)
{
	__m128i upper, lower;

	upper = _mm_slli_epi32(x, 16);	/* low half moved to the top */
	lower = _mm_srli_epi32(x, 16);	/* high half moved to the bottom */
	return upper | lower;
}
831.1Sriastrad
/*
 * mix_columns(q)
 *
 *	Linear mixing step on the eight state vectors, in place.
 *
 *	With rot[i] = q[i] rotated right by 8 in each 32-bit lane and
 *	sum[i] = q[i] ^ rot[i], the new value of q[i] is
 *
 *		sum[i - 1] ^ rot[i] ^ rotr16(sum[i])
 *
 *	where the index i - 1 wraps around to 7 for i = 0, and vectors
 *	1, 3 and 4 additionally receive sum[7].
 *
 *	NOTE(review): this matches multiplication by x modulo the AES
 *	polynomial x^8 + x^4 + x^3 + x + 1, assuming q[i] holds bit i of
 *	each state byte -- confirm against the bitslice layout used by
 *	the ortho/S-box code.
 */
static inline void
mix_columns(__m128i q[static 8])
{
	__m128i rot[8], sum[8];
	unsigned i;

	/* Read all inputs first; q[] is overwritten only afterwards. */
	for (i = 0; i < 8; i++) {
		rot[i] = _mm_srli_epi32(q[i], 8) | _mm_slli_epi32(q[i], 24);
		sum[i] = q[i] ^ rot[i];
	}

	q[0] = sum[7] ^ rot[0] ^ rotr16(sum[0]);
	for (i = 1; i < 8; i++)
		q[i] = sum[i - 1] ^ rot[i] ^ rotr16(sum[i]);

	/* Extra contribution of the top vector to vectors 1, 3 and 4. */
	q[1] ^= sum[7];
	q[3] ^= sum[7];
	q[4] ^= sum[7];
}
1161.1Sriastrad
/*
 * aes_sse2_4x32_bitslice_encrypt(num_rounds, skey, q)
 *
 *	Run the encryption rounds over the state q, in place.
 *
 *	skey is consumed eight 32-bit words per round key: words
 *	[8*r, 8*r + 8) form round key r, for r = 0 .. num_rounds.  The
 *	array is declared with at least 120 words, so num_rounds must
 *	not exceed 14 for every access to stay inside it.
 *
 *	Round key 0 is added first; rounds 1 .. num_rounds - 1 apply
 *	S-box, shift_rows, mix_columns and the round key; the last
 *	round leaves out mix_columns.
 *
 *	The prototype is presumably in aes_sse2_4x32_impl.h (the
 *	comment inherited from the original code pointed at inner.h).
 */
void
aes_sse2_4x32_bitslice_encrypt(unsigned num_rounds,
	const uint32_t skey[static 120], __m128i q[static 8])
{
	unsigned round;

	/* Initial key addition with round key 0. */
	add_round_key(q, &skey[0]);

	/* Full rounds. */
	for (round = 1; round < num_rounds; round++) {
		aes_sse2_4x32_bitslice_Sbox(q);
		shift_rows(q);
		mix_columns(q);
		add_round_key(q, &skey[8 * round]);
	}

	/* Final round: no mix_columns. */
	aes_sse2_4x32_bitslice_Sbox(q);
	shift_rows(q);
	add_round_key(q, &skey[8 * num_rounds]);
}
135