Home | History | Annotate | Line # | Download | only in ref
      1 
      2 #ifndef blake2b_compress_avx2_H
      3 #define blake2b_compress_avx2_H
      4 
/*
 * Thin wrappers around the SSE2/AVX load and store intrinsics that accept
 * any pointer type and cast it to the vector pointer type the intrinsic
 * expects.
 *
 *   LOAD128 / STORE128   - 128-bit access, aligned variant of the intrinsic
 *   LOADU128 / STOREU128 - 128-bit access, unaligned variant
 *   LOAD / STORE         - 256-bit access, aligned variant
 *   LOADU / STOREU       - 256-bit access, unaligned variant
 *
 * The load intrinsics take a pointer-to-const, so the load macros cast to a
 * const-qualified vector pointer: reading from a const table must not
 * silently discard the qualifier.  The store macros keep a non-const cast.
 */
#define LOAD128(p) _mm_load_si128((const __m128i *) (p))
#define STORE128(p, r) _mm_store_si128((__m128i *) (p), (r))

#define LOADU128(p) _mm_loadu_si128((const __m128i *) (p))
#define STOREU128(p, r) _mm_storeu_si128((__m128i *) (p), (r))

#define LOAD(p) _mm256_load_si256((const __m256i *) (p))
#define STORE(p, r) _mm256_store_si256((__m256i *) (p), (r))

#define LOADU(p) _mm256_loadu_si256((const __m256i *) (p))
#define STOREU(p, r) _mm256_storeu_si256((__m256i *) (p), (r))
     16 
/*
 * Read a 64-bit word from an address with no alignment guarantee.
 *
 * The bytes are copied with memcpy() rather than read through a casted
 * pointer, so the access is valid for any alignment of `p`.  The value is
 * returned in the host's native byte order; no byte swapping is done.
 *
 * p: address of at least sizeof(uint64_t) readable bytes.
 */
static inline uint64_t
LOADU64(const void *p)
{
    uint64_t word;

    memcpy(&word, p, sizeof(uint64_t));

    return word;
}
     24 
/*
 * Byte-shuffle control vectors used by ROT16 and ROT24 with
 * _mm256_shuffle_epi8.  Each row below describes one 64-bit lane: entry i
 * names the source byte (within the same 128-bit half) that ends up in
 * destination byte i.  The same 16-entry pattern is repeated for both
 * 128-bit halves of the register.
 *
 * ROTATE16 moves source byte 2 into byte 0 of every 64-bit lane, i.e. it
 * rotates each lane right by 16 bits.
 */
#define ROTATE16                                      \
    _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1,          \
                     10, 11, 12, 13, 14, 15, 8, 9,    \
                     2, 3, 4, 5, 6, 7, 0, 1,          \
                     10, 11, 12, 13, 14, 15, 8, 9)

/*
 * ROTATE24 moves source byte 3 into byte 0 of every 64-bit lane, i.e. it
 * rotates each lane right by 24 bits.
 */
#define ROTATE24                                      \
    _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2,          \
                     11, 12, 13, 14, 15, 8, 9, 10,    \
                     3, 4, 5, 6, 7, 0, 1, 2,          \
                     11, 12, 13, 14, 15, 8, 9, 10)
     32 
/* Lane-wise 64-bit integer addition and subtraction on 256-bit vectors. */
#define ADD(a, b) _mm256_add_epi64((a), (b))
#define SUB(a, b) _mm256_sub_epi64((a), (b))

/* Bitwise operations on whole 256-bit vectors. */
#define XOR(a, b) _mm256_xor_si256((a), (b))
#define AND(a, b) _mm256_and_si256((a), (b))
#define OR(a, b) _mm256_or_si256((a), (b))

/*
 * Rotations of every 64-bit lane of x:
 *   ROT32 - swaps the two 32-bit halves of each lane (rotation by 32 bits).
 *   ROT24 - byte shuffle driven by the ROTATE24 control vector.
 *   ROT16 - byte shuffle driven by the ROTATE16 control vector.
 *   ROT63 - (x >> 63) | (x + x): the top bit wraps around to bit 0 and the
 *           rest shifts up by one, i.e. a right rotation by 63 bits.
 *           Note that ROT63 expands its argument more than once.
 */
#define ROT32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))
#define ROT24(x) _mm256_shuffle_epi8((x), ROTATE24)
#define ROT16(x) _mm256_shuffle_epi8((x), ROTATE16)
#define ROT63(x) OR(_mm256_srli_epi64((x), 63), ADD((x), (x)))
     44 
/*
 * First half of the mixing step, applied to all four 64-bit lanes at once.
 *
 * a, b, c, d must be modifiable __m256i lvalues; all four are updated in
 * place.  m is the vector of message words added into a.
 *
 *   a = a + m + b
 *   d = (d ^ a) rotated by 32 bits
 *   c = c + d
 *   b = (b ^ c) rotated by 24 bits
 */
#define BLAKE2B_G1_V1(a, b, c, d, m) \
    do {                             \
        a = ADD(ADD(a, m), b);       \
        d = ROT32(XOR(d, a));        \
        c = ADD(c, d);               \
        b = ROT24(XOR(b, c));        \
    } while (0)
     55 
/*
 * Second half of the mixing step, applied to all four 64-bit lanes at once.
 *
 * a, b, c, d must be modifiable __m256i lvalues; all four are updated in
 * place.  m is the vector of message words added into a.
 *
 *   a = a + m + b
 *   d = (d ^ a) rotated by 16 bits
 *   c = c + d
 *   b = (b ^ c) rotated by 63 bits
 */
#define BLAKE2B_G2_V1(a, b, c, d, m)                                   \
    do {                                                               \
        a = ADD(ADD(a, m), b);                                         \
        d = ROT16(XOR(d, a));                                          \
        c = ADD(c, d);                                                 \
        /* ROT63 repeats its argument, so hand it a plain variable */  \
        b = XOR(b, c);                                                 \
        b = ROT63(b);                                                  \
    } while (0)
     66 
/*
 * Rotate the 64-bit lanes of rows b, c and d so that a following
 * lane-wise mixing step works on the diagonals of the 4x4 state instead of
 * its columns.  Row a is left untouched (the parameter is accepted only to
 * keep the signature uniform).
 *
 * With lanes numbered 0..3, new lane i is taken from old lane:
 *   b: (i + 1) mod 4
 *   c: (i + 2) mod 4
 *   d: (i + 3) mod 4
 *
 * b, c, d must be modifiable __m256i lvalues.
 */
#define BLAKE2B_DIAG_V1(a, b, c, d)                               \
    do {                                                          \
        b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \
        c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
        d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while (0)
     73 
/*
 * Inverse of BLAKE2B_DIAG_V1: rotate the 64-bit lanes of rows b, c and d
 * back to their column positions.  Row a is left untouched.
 *
 * With lanes numbered 0..3, new lane i is taken from old lane:
 *   b: (i + 3) mod 4
 *   c: (i + 2) mod 4
 *   d: (i + 1) mod 4
 *
 * b, c, d must be modifiable __m256i lvalues.
 */
#define BLAKE2B_UNDIAG_V1(a, b, c, d)                             \
    do {                                                          \
        b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \
        c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
        d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while (0)
     80 
     81 #include "blake2b-load-avx2.h"
     82 
/*
 * One round over the state rows a, b, c, d (modifiable __m256i lvalues).
 *
 * r is pasted into the names BLAKE2B_LOAD_MSG_<r>_1 .. _4, so it must be a
 * literal round number token, not an expression or a variable.  Those
 * loader macros come from blake2b-load-avx2.h and are not visible here;
 * presumably each one fills b0 with the message words for its step -
 * confirm against that header.
 *
 * The parameter m is not referenced by this macro's own body.
 *
 * The local is intentionally still called b0: a macro-local name can
 * capture a caller variable of the same name, so it is part of the macro's
 * de-facto contract.
 */
#define BLAKE2B_ROUND_V1(a, b, c, d, r, m)                    \
    do {                                                      \
        __m256i b0;                                           \
                                                              \
        /* first half: rows in their current lane order */    \
        BLAKE2B_LOAD_MSG_##r##_1(b0);                         \
        BLAKE2B_G1_V1(a, b, c, d, b0);                        \
        BLAKE2B_LOAD_MSG_##r##_2(b0);                         \
        BLAKE2B_G2_V1(a, b, c, d, b0);                        \
                                                              \
        /* second half: same two steps on the rotated rows */ \
        BLAKE2B_DIAG_V1(a, b, c, d);                          \
        BLAKE2B_LOAD_MSG_##r##_3(b0);                         \
        BLAKE2B_G1_V1(a, b, c, d, b0);                        \
        BLAKE2B_LOAD_MSG_##r##_4(b0);                         \
        BLAKE2B_G2_V1(a, b, c, d, b0);                        \
        BLAKE2B_UNDIAG_V1(a, b, c, d);                        \
    } while (0)
     97 
/*
 * Run the twelve rounds, numbered 0 through 11, in order over the state
 * rows a, b, c, d (modifiable __m256i lvalues).
 *
 * The round numbers are spelled out as literals because BLAKE2B_ROUND_V1
 * pastes them into macro names; they cannot come from a loop counter.
 * m is forwarded unchanged to every round.
 */
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m)       \
    do {                                       \
        BLAKE2B_ROUND_V1(a, b, c, d, 0, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 1, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 2, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 3, (m));  \
                                               \
        BLAKE2B_ROUND_V1(a, b, c, d, 4, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 5, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 6, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 7, (m));  \
                                               \
        BLAKE2B_ROUND_V1(a, b, c, d, 8, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 9, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
        BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
    } while (0)
    113 
/*
 * Load 16 bytes from (m) + off with an unaligned load and copy them into
 * both 128-bit halves of a 256-bit vector.
 */
#define BLAKE2B_BROADCAST_MSG128(m, off) \
    _mm256_broadcastsi128_si256(LOADU128((m) + (off)))

/*
 * Declare the locals the message-loading macros work with:
 *
 *   m0 .. m7 - const vectors, each holding the 16 bytes found at offsets
 *              0, 16, ..., 112 from m, duplicated in both 128-bit halves.
 *   t0, t1   - uninitialised scratch vectors.
 *
 * The offsets are applied with ordinary pointer arithmetic, so they are
 * byte offsets only when m points to a byte-sized type - presumably a
 * uint8_t pointer to a 128-byte block; confirm against the caller.
 *
 * This expands to declarations (it already ends in a semicolon), so it
 * must be used where declarations are allowed and without a trailing ';'.
 */
#define DECLARE_MESSAGE_WORDS(m)                            \
    const __m256i m0 = BLAKE2B_BROADCAST_MSG128(m, 0);      \
    const __m256i m1 = BLAKE2B_BROADCAST_MSG128(m, 16);     \
    const __m256i m2 = BLAKE2B_BROADCAST_MSG128(m, 32);     \
    const __m256i m3 = BLAKE2B_BROADCAST_MSG128(m, 48);     \
    const __m256i m4 = BLAKE2B_BROADCAST_MSG128(m, 64);     \
    const __m256i m5 = BLAKE2B_BROADCAST_MSG128(m, 80);     \
    const __m256i m6 = BLAKE2B_BROADCAST_MSG128(m, 96);     \
    const __m256i m7 = BLAKE2B_BROADCAST_MSG128(m, 112);    \
    __m256i       t0, t1;
    124 
/*
 * Compress one message block into the state held in a and b.
 *
 *   a, b   - modifiable __m256i lvalues holding the two state rows; they
 *            are both input and output.
 *   m      - message pointer, handed to DECLARE_MESSAGE_WORDS.
 *   t0, t1 - 64-bit values XORed into lanes 0 and 1 of the fourth row.
 *   f0, f1 - 64-bit values XORed into lanes 2 and 3 of the fourth row.
 *
 * Steps: remember the incoming a and b, build rows c and d from
 * blake2b_IV[0..3] and blake2b_IV[4..7] (d additionally XORed with
 * t0/t1/f0/f1), run the rounds, then set
 *     a = a ^ c ^ saved_a,   b = b ^ d ^ saved_b.
 *
 * blake2b_IV is defined elsewhere and is read with the aligned LOAD, so
 * it is assumed to be suitably aligned for a 256-bit aligned load -
 * confirm at its definition.
 *
 * NOTE(review): DECLARE_MESSAGE_WORDS introduces locals named m0..m7, t0
 * and t1, and this macro adds iv0, iv1, c and d.  Argument expressions that
 * mention any of those identifiers would bind to these locals instead of
 * the caller's variables (e.g. passing a variable literally named t0).
 */
#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1)        \
    do {                                                    \
        DECLARE_MESSAGE_WORDS(m)                            \
        const __m256i iv0 = a;                              \
        const __m256i iv1 = b;                              \
        __m256i       c   = LOAD(&blake2b_IV[0]);           \
        __m256i       d   = LOAD(&blake2b_IV[4]);           \
                                                            \
        d = XOR(d, _mm256_set_epi64x(f1, f0, t1, t0));      \
        BLAKE2B_ROUNDS_V1(a, b, c, d, m);                   \
        a = XOR(XOR(a, c), iv0);                            \
        b = XOR(XOR(b, d), iv1);                            \
    } while (0)
    139 
    140 #endif
    141