Home | History | Annotate | Line # | Download | only in ref
      1 
      2 #define BLAKE2_USE_SSSE3
      3 #define BLAKE2_USE_SSE41
      4 
      5 #include <stdint.h>
      6 #include <string.h>
      7 
      8 #include "blake2.h"
      9 #include "private/common.h"
     10 #include "private/sse2_64_32.h"
     11 
     12 #if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && \
     13     defined(HAVE_SMMINTRIN_H)
     14 
/*
 * On compilers that define __GNUC__, switch on the SSE2, SSSE3 and SSE4.1
 * target options for the code that follows in this translation unit --
 * presumably so the intrinsics used below are accepted even when the build
 * does not pass the matching -m flags (confirm against the build system).
 */
# ifdef __GNUC__
#  pragma GCC target("sse2")
#  pragma GCC target("ssse3")
#  pragma GCC target("sse4.1")
# endif
     20 
     21 # include <emmintrin.h>
     22 # include <smmintrin.h>
     23 # include <tmmintrin.h>
     24 
     25 # include "blake2b-compress-sse41.h"
     26 
     27 CRYPTO_ALIGN(64)
     28 static const uint64_t blake2b_IV[8] = {
     29     0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL,
     30     0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
     31     0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
     32 };
     33 
/*
 * Apply the BLAKE2b compression function to one message block using
 * 128-bit SSE intrinsics, updating the chaining value S->h in place.
 *
 * S     - hash state. Read: S->h[0..7], plus 16 bytes starting at S->t[0]
 *         and 16 bytes starting at S->f[0]. Written: S->h[0..7] only.
 *         (Presumably t is the message byte counter and f the finalization
 *         flags -- confirm against blake2.h.)
 * block - message block; bytes 0..127 are read via eight 16-byte loads.
 *
 * Always returns 0.
 *
 * NOTE(review): ROUND, LOADU and STOREU are macros defined in headers that
 * are not visible here. ROUND presumably refers to the locals declared
 * below by name (row*, m0..m7, b0/b1, t0/t1, r16/r24), so none of them
 * should be renamed or removed without checking those headers.
 */
int
blake2b_compress_sse41(blake2b_state *S,
                       const uint8_t  block[BLAKE2B_BLOCKBYTES])
{
    /* 4x4 working matrix of 64-bit words: each row is split into a low
     * (words 0-1) and a high (words 2-3) 128-bit register. */
    __m128i       row1l, row1h;
    __m128i       row2l, row2h;
    __m128i       row3l, row3h;
    __m128i       row4l, row4h;
    /* Not referenced directly in this function; presumably scratch
     * registers used inside the ROUND macro -- confirm in the header. */
    __m128i       b0, b1;
    __m128i       t0, t1;
    /* Byte-permutation masks. Used as a _mm_shuffle_epi8 control they would
     * rotate each 64-bit lane right by 16 and by 24 bits respectively;
     * presumably consumed by ROUND -- confirm in the header. */
    const __m128i r16 =
        _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
    const __m128i r24 =
        _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
    /* Message block as eight 16-byte chunks (two 64-bit words each). */
    const __m128i m0 = LOADU(block + 00);
    const __m128i m1 = LOADU(block + 16);
    const __m128i m2 = LOADU(block + 32);
    const __m128i m3 = LOADU(block + 48);
    const __m128i m4 = LOADU(block + 64);
    const __m128i m5 = LOADU(block + 80);
    const __m128i m6 = LOADU(block + 96);
    const __m128i m7 = LOADU(block + 112);
    /* Rows 1-2: current chaining value h[0..7]. */
    row1l            = LOADU(&S->h[0]);
    row1h            = LOADU(&S->h[2]);
    row2l            = LOADU(&S->h[4]);
    row2h            = LOADU(&S->h[6]);
    /* Row 3: IV[0..3]. */
    row3l            = LOADU(&blake2b_IV[0]);
    row3h            = LOADU(&blake2b_IV[2]);
    /* Row 4: IV[4..5] XOR the 16 bytes at S->t, IV[6..7] XOR the 16 bytes
     * at S->f. */
    row4l            = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
    row4h            = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
    /* Twelve rounds; the argument is the round index handed to the macro. */
    ROUND(0);
    ROUND(1);
    ROUND(2);
    ROUND(3);
    ROUND(4);
    ROUND(5);
    ROUND(6);
    ROUND(7);
    ROUND(8);
    ROUND(9);
    ROUND(10);
    ROUND(11);
    /* Feed-forward: h[0..3] ^= row1 ^ row3. */
    row1l = _mm_xor_si128(row3l, row1l);
    row1h = _mm_xor_si128(row3h, row1h);
    STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
    STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
    /* Feed-forward: h[4..7] ^= row2 ^ row4. */
    row2l = _mm_xor_si128(row4l, row2l);
    row2h = _mm_xor_si128(row4h, row2h);
    STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
    STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
    return 0;
}
     86 
     87 #endif
     88