#include <stdint.h>
#include <string.h>

#include "../onetimeauth_poly1305.h"
#include "crypto_verify_16.h"
#include "poly1305_sse2.h"
#include "private/common.h"
#include "private/sse2_64_32.h"
#include "utils.h"

#if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H)

# ifdef __GNUC__
#  pragma GCC target("sse2")
# endif

# include <emmintrin.h>

typedef __m128i xmmi;

# if defined(_MSC_VER)
#  define POLY1305_NOINLINE __declspec(noinline)
# elif defined(__GNUC__)
#  define POLY1305_NOINLINE __attribute__((noinline))
# else
#  define POLY1305_NOINLINE
# endif

# define poly1305_block_size 32

enum poly1305_state_flags_t {
    poly1305_started       = 1,
    poly1305_final_shift8  = 4,
    poly1305_final_shift16 = 8,
    poly1305_final_r2_r    = 16, /* use [r^2,r] for the final block */
    poly1305_final_r_1     = 32  /* use [r,1] for the final block */
};

typedef struct poly1305_state_internal_t {
    union {
        uint64_t h[3];
        uint32_t hh[10];
    } H;                                            /*  40 bytes  */
    uint32_t           R[5];                        /*  20 bytes  */
    uint32_t           R2[5];                       /*  20 bytes  */
    uint32_t           R4[5];                       /*  20 bytes  */
    uint64_t           pad[2];                      /*  16 bytes  */
    uint64_t           flags;                       /*   8 bytes  */
    unsigned long long leftover;                    /* 8 bytes */
    unsigned char      buffer[poly1305_block_size]; /* 32 bytes */
} poly1305_state_internal_t;                        /* 164 bytes total */
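/*
 * H is kept in two forms: while hashing, H.hh holds five 26-bit limbs for
 * each of the two SIMD lanes (ten 32-bit words); once the lanes are folded
 * by the last poly1305_blocks() call, H.h holds the combined value as three
 * 44/44/42-bit limbs for the scalar finalization. R, R2 and R4 cache r, r^2
 * and r^4 as five 26-bit limbs each, and pad stores the second half of the
 * key (the value added to the final hash).
 */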

/*
 * _mm_loadl_epi64() is turned into a simple MOVQ. So, unaligned accesses are
 * totally fine, even though this intrinsic requires a __m128i* input.
 * This confuses dynamic analysis, so force alignment, only in debug mode.
 */
# ifdef DEBUG
static xmmi
_fakealign_mm_loadl_epi64(const void *m)
{
    xmmi tmp;
    memcpy(&tmp, m, 8);

    return _mm_loadl_epi64(&tmp);
}
# define _mm_loadl_epi64(X) _fakealign_mm_loadl_epi64(X)
# endif

/* copy 0-31 bytes */
static inline void
poly1305_block_copy31(unsigned char *dst, const unsigned char *src,
                      unsigned long long bytes)
{
    if (bytes & 16) {
        _mm_store_si128((xmmi *) (void *) dst,
                        _mm_loadu_si128((const xmmi *) (const void *) src));
        src += 16;
        dst += 16;
    }
    if (bytes & 8) {
        memcpy(dst, src, 8);
        src += 8;
        dst += 8;
    }
    if (bytes & 4) {
        memcpy(dst, src, 4);
        src += 4;
        dst += 4;
    }
    if (bytes & 2) {
        memcpy(dst, src, 2);
        src += 2;
        dst += 2;
    }
    if (bytes & 1) {
        *dst = *src;
    }
}

static POLY1305_NOINLINE void
poly1305_init_ext(poly1305_state_internal_t *st, const unsigned char key[32],
                  unsigned long long bytes)
{
    uint32_t          *R;
    uint128_t          d[3];
    uint64_t           r0, r1, r2;
    uint64_t           rt0, rt1, rt2, st2, c;
    uint64_t           t0, t1;
    unsigned long long i;

    if (!bytes) {
        bytes = ~(unsigned long long) 0;
    }
    /* H = 0 */
    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[8], _mm_setzero_si128());

    /* clamp key */
    memcpy(&t0, key, 8);
    memcpy(&t1, key + 8, 8);
    r0 = t0 & 0xffc0fffffff;
    t0 >>= 44;
    t0 |= t1 << 20;
    r1 = t0 & 0xfffffc0ffff;
    t1 >>= 24;
    r2 = t1 & 0x00ffffffc0f;

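    /* r is now held as three 44/44/42-bit limbs (r0, r1, r2); the masks above
     * also apply the standard Poly1305 clamp of r. Below, each power of r is
     * re-split into five 26-bit limbs so that the 32x32->64-bit SSE2 multiplies
     * and their sums cannot overflow a 64-bit lane. */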
    /* r^1 */
    R    = st->R;
    R[0] = (uint32_t)(r0) &0x3ffffff;
    R[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
    R[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
    R[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
    R[4] = (uint32_t)((r2 >> 16));

    /* save pad */
    memcpy(&st->pad[0], key + 16, 8);
    memcpy(&st->pad[1], key + 24, 8);

    rt0 = r0;
    rt1 = r1;
    rt2 = r2;

    /* r^2, r^4 */
    for (i = 0; i < 2; i++) {
        if (i == 0) {
            R = st->R2;
            if (bytes <= 16) {
                break;
            }
        } else if (i == 1) {
            R = st->R4;
            if (bytes < 96) {
                break;
            }
        }
        st2 = rt2 * (5 << 2);

        d[0] = ((uint128_t) rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2);
        d[1] = ((uint128_t) rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1);
        d[2] = ((uint128_t) rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0);

        rt0 = (uint64_t) d[0] & 0xfffffffffff;
        c   = (uint64_t)(d[0] >> 44);
        d[1] += c;

        rt1 = (uint64_t) d[1] & 0xfffffffffff;
        c   = (uint64_t)(d[1] >> 44);
        d[2] += c;

        rt2 = (uint64_t) d[2] & 0x3ffffffffff;
        c   = (uint64_t)(d[2] >> 42);
        rt0 += c * 5;
        c   = (rt0 >> 44);
        rt0 = rt0 & 0xfffffffffff;
        rt1 += c;
        c   = (rt1 >> 44);
        rt1 = rt1 & 0xfffffffffff;
        rt2 += c; /* even if rt2 overflows, it will still fit in rp4 safely, and
                     is safe to multiply with */

        R[0] = (uint32_t)(rt0) &0x3ffffff;
        R[1] = (uint32_t)((rt0 >> 26) | (rt1 << 18)) & 0x3ffffff;
        R[2] = (uint32_t)((rt1 >> 8)) & 0x3ffffff;
        R[3] = (uint32_t)((rt1 >> 34) | (rt2 << 10)) & 0x3ffffff;
        R[4] = (uint32_t)((rt2 >> 16));
    }
    st->flags    = 0;
    st->leftover = 0U;
}

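/*
 * Two-way SIMD core: each 128-bit register holds one 26-bit limb for two
 * interleaved 16-byte blocks. Full 64-byte iterations compute
 * H = H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My'], and a trailing 32-byte
 * step uses [r^2,r^2]. When called with m == NULL, the accumulator is
 * multiplied by [r^2,r] or [r,1] (see the flags above) and the two lanes are
 * summed into a single value, stored back as three 44-bit limbs in st->H.h.
 * HIBIT supplies the 2^128 padding bit for each block; the final_shift flags
 * drop it for final blocks that are short or absent, which instead get a
 * 0x01 byte appended by poly1305_finish_ext().
 */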
static POLY1305_NOINLINE void
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
                unsigned long long bytes)
{
    CRYPTO_ALIGN(64)
    xmmi HIBIT =
        _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1, 0, 1, 0));
    const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1),
                                         _MM_SHUFFLE(1, 0, 1, 0));
    const xmmi FIVE =
        _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1, 0, 1, 0));
    xmmi H0, H1, H2, H3, H4;
    xmmi T0, T1, T2, T3, T4, T5, T6, T7, T8;
    xmmi M0, M1, M2, M3, M4;
    xmmi M5, M6, M7, M8;
    xmmi C1, C2;
    xmmi R20, R21, R22, R23, R24, S21, S22, S23, S24;
    xmmi R40, R41, R42, R43, R44, S41, S42, S43, S44;

    if (st->flags & poly1305_final_shift8) {
        HIBIT = _mm_srli_si128(HIBIT, 8);
    }
    if (st->flags & poly1305_final_shift16) {
        HIBIT = _mm_setzero_si128();
    }
    if (!(st->flags & poly1305_started)) {
        /* H = [Mx,My] */
        T5 = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
        T6 = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
        H0 = _mm_and_si128(MMASK, T5);
        H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
        T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
        H2 = _mm_and_si128(MMASK, T5);
        H3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
        H4 = _mm_srli_epi64(T6, 40);
        H4 = _mm_or_si128(H4, HIBIT);
        m += 32;
        bytes -= 32;
        st->flags |= poly1305_started;
    } else {
        T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[0]);
        T1 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[4]);
        T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[8]);
        H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 0, 0));
        H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 2, 2));
        H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1, 1, 0, 0));
        H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 3, 2, 2));
        H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1, 1, 0, 0));
    }
    if (st->flags & (poly1305_final_r2_r | poly1305_final_r_1)) {
        if (st->flags & poly1305_final_r2_r) {
            /* use [r^2, r] */
            T2  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
            T3  = _mm_cvtsi32_si128(st->R[4]);
            T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
            T1  = _mm_cvtsi32_si128(st->R2[4]);
            T4  = _mm_unpacklo_epi32(T0, T2);
            T5  = _mm_unpackhi_epi32(T0, T2);
            R24 = _mm_unpacklo_epi64(T1, T3);
        } else {
            /* use [r^1, 1] */
            T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
            T1  = _mm_cvtsi32_si128(st->R[4]);
            T2  = _mm_cvtsi32_si128(1);
            T4  = _mm_unpacklo_epi32(T0, T2);
            T5  = _mm_unpackhi_epi32(T0, T2);
            R24 = T1;
        }
        R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1, 1, 0, 0));
        R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3, 3, 2, 2));
        R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1, 1, 0, 0));
        R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3, 3, 2, 2));
    } else {
        /* use [r^2, r^2] */
        T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
        T1  = _mm_cvtsi32_si128(st->R2[4]);
        R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
        R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
        R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
        R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
        R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
    }
    S21 = _mm_mul_epu32(R21, FIVE);
    S22 = _mm_mul_epu32(R22, FIVE);
    S23 = _mm_mul_epu32(R23, FIVE);
    S24 = _mm_mul_epu32(R24, FIVE);

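    /* With at least 64 bytes pending, also load r^4 so the main loop can
     * absorb four 16-byte blocks per iteration:
     * H = H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']. */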
    if (bytes >= 64) {
        T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R4[0]);
        T1  = _mm_cvtsi32_si128(st->R4[4]);
        R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
        R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
        R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
        R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
        R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
        S41 = _mm_mul_epu32(R41, FIVE);
        S42 = _mm_mul_epu32(R42, FIVE);
        S43 = _mm_mul_epu32(R43, FIVE);
        S44 = _mm_mul_epu32(R44, FIVE);

        while (bytes >= 64) {
            xmmi v00, v01, v02, v03, v04;
            xmmi v10, v11, v12, v13, v14;
            xmmi v20, v21, v22, v23, v24;
            xmmi v30, v31, v32, v33, v34;
            xmmi v40, v41, v42, v43, v44;
            xmmi T14, T15;

            /* H *= [r^4,r^4], preload [Mx,My] */
            T15 = S42;
            T0  = H4;
            T0  = _mm_mul_epu32(T0, S41);
            v01 = H3;
            v01 = _mm_mul_epu32(v01, T15);
            T14 = S43;
            T1  = H4;
            T1  = _mm_mul_epu32(T1, T15);
            v11 = H3;
            v11 = _mm_mul_epu32(v11, T14);
            T2  = H4;
            T2  = _mm_mul_epu32(T2, T14);
            T0  = _mm_add_epi64(T0, v01);
            T15 = S44;
            v02 = H2;
            v02 = _mm_mul_epu32(v02, T14);
            T3  = H4;
            T3  = _mm_mul_epu32(T3, T15);
            T1  = _mm_add_epi64(T1, v11);
            v03 = H1;
            v03 = _mm_mul_epu32(v03, T15);
            v12 = H2;
            v12 = _mm_mul_epu32(v12, T15);
            T0  = _mm_add_epi64(T0, v02);
            T14 = R40;
            v21 = H3;
            v21 = _mm_mul_epu32(v21, T15);
            v31 = H3;
            v31 = _mm_mul_epu32(v31, T14);
            T0  = _mm_add_epi64(T0, v03);
            T4  = H4;
            T4  = _mm_mul_epu32(T4, T14);
            T1  = _mm_add_epi64(T1, v12);
            v04 = H0;
            v04 = _mm_mul_epu32(v04, T14);
            T2  = _mm_add_epi64(T2, v21);
            v13 = H1;
            v13 = _mm_mul_epu32(v13, T14);
            T3  = _mm_add_epi64(T3, v31);
            T15 = R41;
            v22 = H2;
            v22 = _mm_mul_epu32(v22, T14);
            v32 = H2;
            v32 = _mm_mul_epu32(v32, T15);
            T0  = _mm_add_epi64(T0, v04);
            v41 = H3;
            v41 = _mm_mul_epu32(v41, T15);
            T1  = _mm_add_epi64(T1, v13);
            v14 = H0;
            v14 = _mm_mul_epu32(v14, T15);
            T2  = _mm_add_epi64(T2, v22);
            T14 = R42;
            T5  = _mm_unpacklo_epi64(
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
            v23 = H1;
            v23 = _mm_mul_epu32(v23, T15);
            T3  = _mm_add_epi64(T3, v32);
            v33 = H1;
            v33 = _mm_mul_epu32(v33, T14);
            T4  = _mm_add_epi64(T4, v41);
            v42 = H2;
            v42 = _mm_mul_epu32(v42, T14);
            T1  = _mm_add_epi64(T1, v14);
            T15 = R43;
            T6  = _mm_unpacklo_epi64(
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
            v24 = H0;
            v24 = _mm_mul_epu32(v24, T14);
            T2  = _mm_add_epi64(T2, v23);
            v34 = H0;
            v34 = _mm_mul_epu32(v34, T15);
            T3  = _mm_add_epi64(T3, v33);
            M0  = _mm_and_si128(MMASK, T5);
            v43 = H1;
            v43 = _mm_mul_epu32(v43, T15);
            T4  = _mm_add_epi64(T4, v42);
            M1  = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
            v44 = H0;
            v44 = _mm_mul_epu32(v44, R44);
            T2  = _mm_add_epi64(T2, v24);
            T5  = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
            T3  = _mm_add_epi64(T3, v34);
            M3  = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
            T4  = _mm_add_epi64(T4, v43);
            M2  = _mm_and_si128(MMASK, T5);
            T4  = _mm_add_epi64(T4, v44);
            M4  = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

            /* H += [Mx',My'] */
            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 32));
            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 48));
            T7 = _mm_unpacklo_epi32(T5, T6);
            T8 = _mm_unpackhi_epi32(T5, T6);
            M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
            M6 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
            M7 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
            M8 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
            M6 = _mm_slli_epi64(M6, 6);
            M7 = _mm_slli_epi64(M7, 12);
            M8 = _mm_slli_epi64(M8, 18);
            T0 = _mm_add_epi64(T0, M5);
            T1 = _mm_add_epi64(T1, M6);
            T2 = _mm_add_epi64(T2, M7);
            T3 = _mm_add_epi64(T3, M8);
            T4 = _mm_add_epi64(T4, HIBIT);

            /* H += [Mx,My]*[r^2,r^2] */
            T15 = S22;
            v00 = M4;
            v00 = _mm_mul_epu32(v00, S21);
            v01 = M3;
            v01 = _mm_mul_epu32(v01, T15);
            T14 = S23;
            v10 = M4;
            v10 = _mm_mul_epu32(v10, T15);
            v11 = M3;
            v11 = _mm_mul_epu32(v11, T14);
            T0  = _mm_add_epi64(T0, v00);
            v20 = M4;
            v20 = _mm_mul_epu32(v20, T14);
            T0  = _mm_add_epi64(T0, v01);
            T15 = S24;
            v02 = M2;
            v02 = _mm_mul_epu32(v02, T14);
            T1  = _mm_add_epi64(T1, v10);
            v30 = M4;
            v30 = _mm_mul_epu32(v30, T15);
            T1  = _mm_add_epi64(T1, v11);
            v03 = M1;
            v03 = _mm_mul_epu32(v03, T15);
            T2  = _mm_add_epi64(T2, v20);
            v12 = M2;
            v12 = _mm_mul_epu32(v12, T15);
            T0  = _mm_add_epi64(T0, v02);
            T14 = R20;
            v21 = M3;
            v21 = _mm_mul_epu32(v21, T15);
            T3  = _mm_add_epi64(T3, v30);
            v31 = M3;
            v31 = _mm_mul_epu32(v31, T14);
            T0  = _mm_add_epi64(T0, v03);
            v40 = M4;
            v40 = _mm_mul_epu32(v40, T14);
            T1  = _mm_add_epi64(T1, v12);
            v04 = M0;
            v04 = _mm_mul_epu32(v04, T14);
            T2  = _mm_add_epi64(T2, v21);
            v13 = M1;
            v13 = _mm_mul_epu32(v13, T14);
            T3  = _mm_add_epi64(T3, v31);
            T15 = R21;
            v22 = M2;
            v22 = _mm_mul_epu32(v22, T14);
            T4  = _mm_add_epi64(T4, v40);
            v32 = M2;
            v32 = _mm_mul_epu32(v32, T15);
            T0  = _mm_add_epi64(T0, v04);
            v41 = M3;
            v41 = _mm_mul_epu32(v41, T15);
            T1  = _mm_add_epi64(T1, v13);
            v14 = M0;
            v14 = _mm_mul_epu32(v14, T15);
            T2  = _mm_add_epi64(T2, v22);
            T14 = R22;
            v23 = M1;
            v23 = _mm_mul_epu32(v23, T15);
            T3  = _mm_add_epi64(T3, v32);
            v33 = M1;
            v33 = _mm_mul_epu32(v33, T14);
            T4  = _mm_add_epi64(T4, v41);
            v42 = M2;
            v42 = _mm_mul_epu32(v42, T14);
            T1  = _mm_add_epi64(T1, v14);
            T15 = R23;
            v24 = M0;
            v24 = _mm_mul_epu32(v24, T14);
            T2  = _mm_add_epi64(T2, v23);
            v34 = M0;
            v34 = _mm_mul_epu32(v34, T15);
            T3  = _mm_add_epi64(T3, v33);
            v43 = M1;
            v43 = _mm_mul_epu32(v43, T15);
            T4  = _mm_add_epi64(T4, v42);
            v44 = M0;
            v44 = _mm_mul_epu32(v44, R24);
            T2  = _mm_add_epi64(T2, v24);
            T3  = _mm_add_epi64(T3, v34);
            T4  = _mm_add_epi64(T4, v43);
            T4  = _mm_add_epi64(T4, v44);

            /* reduce */
            C1 = _mm_srli_epi64(T0, 26);
            C2 = _mm_srli_epi64(T3, 26);
            T0 = _mm_and_si128(T0, MMASK);
            T3 = _mm_and_si128(T3, MMASK);
            T1 = _mm_add_epi64(T1, C1);
            T4 = _mm_add_epi64(T4, C2);
            C1 = _mm_srli_epi64(T1, 26);
            C2 = _mm_srli_epi64(T4, 26);
            T1 = _mm_and_si128(T1, MMASK);
            T4 = _mm_and_si128(T4, MMASK);
            T2 = _mm_add_epi64(T2, C1);
            T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
            C1 = _mm_srli_epi64(T2, 26);
            C2 = _mm_srli_epi64(T0, 26);
            T2 = _mm_and_si128(T2, MMASK);
            T0 = _mm_and_si128(T0, MMASK);
            T3 = _mm_add_epi64(T3, C1);
            T1 = _mm_add_epi64(T1, C2);
            C1 = _mm_srli_epi64(T3, 26);
            T3 = _mm_and_si128(T3, MMASK);
            T4 = _mm_add_epi64(T4, C1);

            /* Final: H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']) */
            H0 = T0;
            H1 = T1;
            H2 = T2;
            H3 = T3;
            H4 = T4;

            m += 64;
            bytes -= 64;
        }
    }

    if (bytes >= 32) {
        xmmi v01, v02, v03, v04;
        xmmi v11, v12, v13, v14;
        xmmi v21, v22, v23, v24;
        xmmi v31, v32, v33, v34;
        xmmi v41, v42, v43, v44;
        xmmi T14, T15;

        /* H *= [r^2,r^2] */
        T15 = S22;
        T0  = H4;
        T0  = _mm_mul_epu32(T0, S21);
        v01 = H3;
        v01 = _mm_mul_epu32(v01, T15);
        T14 = S23;
        T1  = H4;
        T1  = _mm_mul_epu32(T1, T15);
        v11 = H3;
        v11 = _mm_mul_epu32(v11, T14);
        T2  = H4;
        T2  = _mm_mul_epu32(T2, T14);
        T0  = _mm_add_epi64(T0, v01);
        T15 = S24;
        v02 = H2;
        v02 = _mm_mul_epu32(v02, T14);
        T3  = H4;
        T3  = _mm_mul_epu32(T3, T15);
        T1  = _mm_add_epi64(T1, v11);
        v03 = H1;
        v03 = _mm_mul_epu32(v03, T15);
        v12 = H2;
        v12 = _mm_mul_epu32(v12, T15);
        T0  = _mm_add_epi64(T0, v02);
        T14 = R20;
        v21 = H3;
        v21 = _mm_mul_epu32(v21, T15);
        v31 = H3;
        v31 = _mm_mul_epu32(v31, T14);
        T0  = _mm_add_epi64(T0, v03);
        T4  = H4;
        T4  = _mm_mul_epu32(T4, T14);
        T1  = _mm_add_epi64(T1, v12);
        v04 = H0;
        v04 = _mm_mul_epu32(v04, T14);
        T2  = _mm_add_epi64(T2, v21);
        v13 = H1;
        v13 = _mm_mul_epu32(v13, T14);
        T3  = _mm_add_epi64(T3, v31);
        T15 = R21;
        v22 = H2;
        v22 = _mm_mul_epu32(v22, T14);
        v32 = H2;
        v32 = _mm_mul_epu32(v32, T15);
        T0  = _mm_add_epi64(T0, v04);
        v41 = H3;
        v41 = _mm_mul_epu32(v41, T15);
        T1  = _mm_add_epi64(T1, v13);
        v14 = H0;
        v14 = _mm_mul_epu32(v14, T15);
        T2  = _mm_add_epi64(T2, v22);
        T14 = R22;
        v23 = H1;
        v23 = _mm_mul_epu32(v23, T15);
        T3  = _mm_add_epi64(T3, v32);
        v33 = H1;
        v33 = _mm_mul_epu32(v33, T14);
        T4  = _mm_add_epi64(T4, v41);
        v42 = H2;
        v42 = _mm_mul_epu32(v42, T14);
        T1  = _mm_add_epi64(T1, v14);
        T15 = R23;
        v24 = H0;
        v24 = _mm_mul_epu32(v24, T14);
        T2  = _mm_add_epi64(T2, v23);
        v34 = H0;
        v34 = _mm_mul_epu32(v34, T15);
        T3  = _mm_add_epi64(T3, v33);
        v43 = H1;
        v43 = _mm_mul_epu32(v43, T15);
        T4  = _mm_add_epi64(T4, v42);
        v44 = H0;
        v44 = _mm_mul_epu32(v44, R24);
        T2  = _mm_add_epi64(T2, v24);
        T3  = _mm_add_epi64(T3, v34);
        T4  = _mm_add_epi64(T4, v43);
        T4  = _mm_add_epi64(T4, v44);

        /* H += [Mx,My] */
        if (m) {
            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 0));
            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 16));
            T7 = _mm_unpacklo_epi32(T5, T6);
            T8 = _mm_unpackhi_epi32(T5, T6);
            M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
            M1 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
            M2 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
            M3 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
            M1 = _mm_slli_epi64(M1, 6);
            M2 = _mm_slli_epi64(M2, 12);
            M3 = _mm_slli_epi64(M3, 18);
            T0 = _mm_add_epi64(T0, M0);
            T1 = _mm_add_epi64(T1, M1);
            T2 = _mm_add_epi64(T2, M2);
            T3 = _mm_add_epi64(T3, M3);
            T4 = _mm_add_epi64(T4, HIBIT);
        }

        /* reduce */
        C1 = _mm_srli_epi64(T0, 26);
        C2 = _mm_srli_epi64(T3, 26);
        T0 = _mm_and_si128(T0, MMASK);
        T3 = _mm_and_si128(T3, MMASK);
        T1 = _mm_add_epi64(T1, C1);
        T4 = _mm_add_epi64(T4, C2);
        C1 = _mm_srli_epi64(T1, 26);
        C2 = _mm_srli_epi64(T4, 26);
        T1 = _mm_and_si128(T1, MMASK);
        T4 = _mm_and_si128(T4, MMASK);
        T2 = _mm_add_epi64(T2, C1);
        T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
        C1 = _mm_srli_epi64(T2, 26);
        C2 = _mm_srli_epi64(T0, 26);
        T2 = _mm_and_si128(T2, MMASK);
        T0 = _mm_and_si128(T0, MMASK);
        T3 = _mm_add_epi64(T3, C1);
        T1 = _mm_add_epi64(T1, C2);
        C1 = _mm_srli_epi64(T3, 26);
        T3 = _mm_and_si128(T3, MMASK);
        T4 = _mm_add_epi64(T4, C1);

        /* H = (H*[r^2,r^2] + [Mx,My]) */
        H0 = T0;
        H1 = T1;
        H2 = T2;
        H3 = T3;
        H4 = T4;
    }

    if (m) {
        T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0, 0, 2, 0));
        T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0, 0, 2, 0));
        T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0, 0, 2, 0));
        T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0, 0, 2, 0));
        T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0, 0, 2, 0));
        T0 = _mm_unpacklo_epi64(T0, T1);
        T1 = _mm_unpacklo_epi64(T2, T3);
        _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], T0);
        _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], T1);
        _mm_storel_epi64((xmmi *) (void *) &st->H.hh[8], T4);
    } else {
        uint32_t t0, t1, t2, t3, t4, b;
        uint64_t h0, h1, h2, g0, g1, g2, c, nc;

        /* H = H[0]+H[1] */
        T0 = H0;
        T1 = H1;
        T2 = H2;
        T3 = H3;
        T4 = H4;

        T0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
        T1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
        T2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
        T3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
        T4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

        t0 = _mm_cvtsi128_si32(T0);
        b  = (t0 >> 26);
        t0 &= 0x3ffffff;
        t1 = _mm_cvtsi128_si32(T1) + b;
        b  = (t1 >> 26);
        t1 &= 0x3ffffff;
        t2 = _mm_cvtsi128_si32(T2) + b;
        b  = (t2 >> 26);
        t2 &= 0x3ffffff;
        t3 = _mm_cvtsi128_si32(T3) + b;
        b  = (t3 >> 26);
        t3 &= 0x3ffffff;
        t4 = _mm_cvtsi128_si32(T4) + b;

        /* everything except t4 is in range, so this is all safe */
        h0 = (((uint64_t) t0) | ((uint64_t) t1 << 26)) & 0xfffffffffffull;
        h1 = (((uint64_t) t1 >> 18) | ((uint64_t) t2 << 8) |
              ((uint64_t) t3 << 34)) &
             0xfffffffffffull;
        h2 = (((uint64_t) t3 >> 10) | ((uint64_t) t4 << 16));

        c = (h2 >> 42);
        h2 &= 0x3ffffffffff;
        h0 += c * 5;
        c = (h0 >> 44);
        h0 &= 0xfffffffffff;
        h1 += c;
        c = (h1 >> 44);
        h1 &= 0xfffffffffff;
        h2 += c;
        c = (h2 >> 42);
        h2 &= 0x3ffffffffff;
        h0 += c * 5;
        c = (h0 >> 44);
        h0 &= 0xfffffffffff;
        h1 += c;

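        /* Compare h against p = 2^130 - 5 in constant time: g = h + 5 - 2^130.
         * If the subtraction does not borrow (g2 non-negative), h >= p and g
         * is the reduced value; otherwise keep h. The selection below is
         * branch-free, using a mask derived from the sign bit of g2. */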
        g0 = h0 + 5;
        c  = (g0 >> 44);
        g0 &= 0xfffffffffff;
        g1 = h1 + c;
        c  = (g1 >> 44);
        g1 &= 0xfffffffffff;
        g2 = h2 + c - ((uint64_t) 1 << 42);

        c  = (g2 >> 63) - 1;
        nc = ~c;
        h0 = (h0 & nc) | (g0 & c);
        h1 = (h1 & nc) | (g1 & c);
        h2 = (h2 & nc) | (g2 & c);

        st->H.h[0] = h0;
        st->H.h[1] = h1;
        st->H.h[2] = h2;
    }
}

static void
poly1305_update(poly1305_state_internal_t *st, const unsigned char *m,
                unsigned long long bytes)
{
    unsigned long long i;

    /* handle leftover */
    if (st->leftover) {
        unsigned long long want = (poly1305_block_size - st->leftover);

        if (want > bytes) {
            want = bytes;
        }
        for (i = 0; i < want; i++) {
            st->buffer[st->leftover + i] = m[i];
        }
        bytes -= want;
        m += want;
        st->leftover += want;
        if (st->leftover < poly1305_block_size) {
            return;
        }
        poly1305_blocks(st, st->buffer, poly1305_block_size);
        st->leftover = 0;
    }

    /* process full blocks */
    if (bytes >= poly1305_block_size) {
        unsigned long long want = (bytes & ~(poly1305_block_size - 1));

        poly1305_blocks(st, m, want);
        m += want;
        bytes -= want;
    }

    /* store leftover */
    if (bytes) {
        for (i = 0; i < bytes; i++) {
            st->buffer[st->leftover + i] = m[i];
        }
        st->leftover += bytes;
    }
}

static POLY1305_NOINLINE void
poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m,
                    unsigned long long leftover, unsigned char mac[16])
{
    uint64_t h0, h1, h2;

    if (leftover) {
        CRYPTO_ALIGN(16) unsigned char final[32] = { 0 };

        poly1305_block_copy31(final, m, leftover);
        if (leftover != 16) {
            final[leftover] = 1;
        }
        st->flags |=
            (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16;
        poly1305_blocks(st, final, 32);
    }

    if (st->flags & poly1305_started) {
        /* finalize, H *= [r^2,r], or H *= [r,1] */
        if (!leftover || (leftover > 16)) {
            st->flags |= poly1305_final_r2_r;
        } else {
            st->flags |= poly1305_final_r_1;
        }
        poly1305_blocks(st, NULL, 32);
    }

    h0 = st->H.h[0];
    h1 = st->H.h[1];
    h2 = st->H.h[2];

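    /* Repack the 44/44/42-bit limbs into two 64-bit words and add the second
     * half of the key (st->pad) modulo 2^128 to form the tag. */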
    /* pad */
    h0 = ((h0) | (h1 << 44));
    h1 = ((h1 >> 20) | (h2 << 24));
#ifdef HAVE_AMD64_ASM
    __asm__ __volatile__(
        "addq %2, %0 ;\n"
        "adcq %3, %1 ;\n"
        : "+r"(h0), "+r"(h1)
        : "r"(st->pad[0]), "r"(st->pad[1])
        : "flags", "cc");
#else
    {
        uint128_t h;

        memcpy(&h, &st->pad[0], 16);
        h += ((uint128_t) h1 << 64) | h0;
        h0 = (uint64_t) h;
        h1 = (uint64_t)(h >> 64);
    }
#endif
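    /* Wipe most of the state with SSE stores before writing the tag;
     * sodium_memzero() below clears the whole structure again. */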
    _mm_storeu_si128((xmmi *) (void *) st + 0, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 1, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 2, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 3, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 4, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 5, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 6, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 7, _mm_setzero_si128());

    memcpy(&mac[0], &h0, 8);
    memcpy(&mac[8], &h1, 8);

    sodium_memzero((void *) st, sizeof *st);
}

static void
poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
{
    poly1305_finish_ext(st, st->buffer, st->leftover, mac);
}

static int
crypto_onetimeauth_poly1305_sse2_init(crypto_onetimeauth_poly1305_state *state,
                                      const unsigned char *key)
{
    COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >=
                    sizeof(poly1305_state_internal_t));
    poly1305_init_ext((poly1305_state_internal_t *) (void *) state, key, 0U);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2_update(
    crypto_onetimeauth_poly1305_state *state, const unsigned char *in,
    unsigned long long inlen)
{
    poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2_final(crypto_onetimeauth_poly1305_state *state,
                                       unsigned char *out)
{
    poly1305_finish((poly1305_state_internal_t *) (void *) state, out);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2(unsigned char *out, const unsigned char *m,
                                 unsigned long long   inlen,
                                 const unsigned char *key)
{
    CRYPTO_ALIGN(64) poly1305_state_internal_t st;
    unsigned long long                         blocks;

    poly1305_init_ext(&st, key, inlen);
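    /* Absorb every complete pair of 16-byte blocks (inlen rounded down to a
     * multiple of 32); the remaining 0..31 bytes are handled by
     * poly1305_finish_ext(). */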
    blocks = inlen & ~31;
    if (blocks > 0) {
        poly1305_blocks(&st, m, blocks);
        m += blocks;
        inlen -= blocks;
    }
    poly1305_finish_ext(&st, m, inlen, out);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2_verify(const unsigned char *h,
                                        const unsigned char *in,
                                        unsigned long long   inlen,
                                        const unsigned char *k)
{
    unsigned char correct[16];

    crypto_onetimeauth_poly1305_sse2(correct, in, inlen, k);

    return crypto_verify_16(h, correct);
}

struct crypto_onetimeauth_poly1305_implementation
    crypto_onetimeauth_poly1305_sse2_implementation = {
        SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_sse2,
        SODIUM_C99(.onetimeauth_verify =)
            crypto_onetimeauth_poly1305_sse2_verify,
        SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_sse2_init,
        SODIUM_C99(.onetimeauth_update =)
            crypto_onetimeauth_poly1305_sse2_update,
        SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_sse2_final
    };

#endif