/*
 * AVX2 implementation of the BLAKE2b compression function, expressed as
 * macros over 256-bit vectors.  One __m256i row holds four 64-bit state
 * words, so the whole 4x4 BLAKE2b state fits in four registers (a, b, c, d).
 *
 * NOTE(review): this header assumes the including .c file provides
 * <stdint.h>, <string.h>, the AVX2 intrinsics header, and the
 * blake2b_IV[8] table — none are included here; confirm against callers.
 */
#ifndef blake2b_compress_avx2_H
#define blake2b_compress_avx2_H

/* Aligned / unaligned 128-bit and 256-bit load/store wrappers. */
#define LOAD128(p) _mm_load_si128((__m128i *) (p))
#define STORE128(p, r) _mm_store_si128((__m128i *) (p), r)

#define LOADU128(p) _mm_loadu_si128((__m128i *) (p))
#define STOREU128(p, r) _mm_storeu_si128((__m128i *) (p), r)

#define LOAD(p) _mm256_load_si256((__m256i *) (p))
#define STORE(p, r) _mm256_store_si256((__m256i *) (p), r)

#define LOADU(p) _mm256_loadu_si256((__m256i *) (p))
#define STOREU(p, r) _mm256_storeu_si256((__m256i *) (p), r)

/*
 * Unaligned 64-bit little-endian-host load.  memcpy (instead of a pointer
 * cast) avoids misaligned-access and strict-aliasing undefined behavior;
 * compilers lower it to a single load.
 */
static inline uint64_t
LOADU64(const void *p)
{
    uint64_t v;
    memcpy(&v, p, sizeof v);
    return v;
}

/*
 * Byte-shuffle control masks for _mm256_shuffle_epi8: each rotates every
 * 64-bit lane right by 16 or 24 bits (e.g. 2,3,4,5,6,7,0,1 moves byte 2
 * into byte 0, i.e. a 2-byte rotate).  The 16-byte pattern is repeated for
 * both 128-bit halves because shuffle_epi8 operates per 128-bit lane.
 */
#define ROTATE16                                                               \
    _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, \
                     3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)

#define ROTATE24                                                               \
    _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, \
                     4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)

/* 4x64-bit lane-wise arithmetic/logic shorthands. */
#define ADD(a, b) _mm256_add_epi64(a, b)
#define SUB(a, b) _mm256_sub_epi64(a, b)

#define XOR(a, b) _mm256_xor_si256(a, b)
#define AND(a, b) _mm256_and_si256(a, b)
#define OR(a, b) _mm256_or_si256(a, b)

/*
 * 64-bit right-rotations by the four BLAKE2b constants, each using the
 * cheapest available instruction:
 *  - ROT32: swap the 32-bit halves of each lane with a dword shuffle;
 *  - ROT24/ROT16: byte shuffles with the masks above;
 *  - ROT63: (x >> 63) | (x << 1), with x+x used for the left shift.
 */
#define ROT32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))
#define ROT24(x) _mm256_shuffle_epi8((x), ROTATE24)
#define ROT16(x) _mm256_shuffle_epi8((x), ROTATE16)
#define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x)))

/*
 * First half of the BLAKE2b G mixing function (rotations 32 and 24),
 * applied to all four columns at once; m holds the four message words.
 */
#define BLAKE2B_G1_V1(a, b, c, d, m) \
    do {                             \
        a = ADD(a, m);               \
        a = ADD(a, b);               \
        d = XOR(d, a);               \
        d = ROT32(d);                \
        c = ADD(c, d);               \
        b = XOR(b, c);               \
        b = ROT24(b);                \
    } while (0)

/* Second half of G (rotations 16 and 63). */
#define BLAKE2B_G2_V1(a, b, c, d, m) \
    do {                             \
        a = ADD(a, m);               \
        a = ADD(a, b);               \
        d = XOR(d, a);               \
        d = ROT16(d);                \
        c = ADD(c, d);               \
        b = XOR(b, c);               \
        b = ROT63(b);                \
    } while (0)

/*
 * Rotate rows b, c, d left by 1, 2, 3 positions so that the next G pass
 * mixes diagonals instead of columns (a stays in place).
 */
#define BLAKE2B_DIAG_V1(a, b, c, d)                               \
    do {                                                          \
        d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \
        c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
        b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while (0)

/* Inverse of BLAKE2B_DIAG_V1: restore column order. */
#define BLAKE2B_UNDIAG_V1(a, b, c, d)                             \
    do {                                                          \
        d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \
        c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
        b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while (0)

/* Defines BLAKE2B_LOAD_MSG_<round>_<step> used by BLAKE2B_ROUND_V1. */
#include "blake2b-load-avx2.h"

/*
 * One full BLAKE2b round r: column step (G1+G2), diagonalize, diagonal
 * step (G1+G2), undiagonalize.  The LOAD_MSG macros gather the permuted
 * message words into b0; they reference the m0..m7 vectors declared by
 * DECLARE_MESSAGE_WORDS in the enclosing expansion (m itself is unused
 * here — it is threaded through only for uniformity).
 */
#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) \
    do {                                   \
        __m256i b0;                        \
        BLAKE2B_LOAD_MSG_##r##_1(b0);      \
        BLAKE2B_G1_V1(a, b, c, d, b0);     \
        BLAKE2B_LOAD_MSG_##r##_2(b0);      \
        BLAKE2B_G2_V1(a, b, c, d, b0);     \
        BLAKE2B_DIAG_V1(a, b, c, d);       \
        BLAKE2B_LOAD_MSG_##r##_3(b0);      \
        BLAKE2B_G1_V1(a, b, c, d, b0);     \
        BLAKE2B_LOAD_MSG_##r##_4(b0);      \
        BLAKE2B_G2_V1(a, b, c, d, b0);     \
        BLAKE2B_UNDIAG_V1(a, b, c, d);     \
    } while (0)

/* All 12 BLAKE2b rounds, fully unrolled (round index is a pasted token). */
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m)       \
    do {                                       \
        BLAKE2B_ROUND_V1(a, b, c, d, 0, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 1, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 2, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 3, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 4, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 5, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 6, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 7, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 8, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 9, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
        BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
    } while (0)

/*
 * Load the 128-byte message block (m, a byte pointer) as eight vectors
 * m0..m7, each broadcasting one unaligned 16-byte chunk into both 128-bit
 * halves; t0/t1 are scratch registers for the LOAD_MSG shuffles.
 */
#define DECLARE_MESSAGE_WORDS(m)                                           \
    const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0));     \
    const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16));    \
    const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32));    \
    const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48));    \
    const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64));    \
    const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80));    \
    const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96));    \
    const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112));   \
    __m256i t0, t1;

/*
 * Full compression: a/b are the state rows h[0..3]/h[4..7] (updated in
 * place), m the 128-byte block, t0/t1 the 128-bit byte counter, f0/f1 the
 * finalization flags (presumably — names follow the BLAKE2 spec; the
 * values are simply XORed into the IV row d here).  Initializes rows c, d
 * from blake2b_IV, runs the 12 rounds, then applies the feed-forward:
 * h' = h ^ (v_low ^ v_high).
 */
#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1)                     \
    do {                                                                 \
        DECLARE_MESSAGE_WORDS(m)                                         \
        const __m256i iv0 = a;                                           \
        const __m256i iv1 = b;                                           \
        __m256i c = LOAD(&blake2b_IV[0]);                                \
        __m256i d =                                                      \
            XOR(LOAD(&blake2b_IV[4]), _mm256_set_epi64x(f1, f0, t1, t0)); \
        BLAKE2B_ROUNDS_V1(a, b, c, d, m);                                \
        a = XOR(a, c);                                                   \
        b = XOR(b, d);                                                   \
        a = XOR(a, iv0);                                                 \
        b = XOR(b, iv1);                                                 \
    } while (0)

#endif