Lines Matching refs:__m128i
36 static inline __m128i
37 rol32(__m128i x, uint8_t n)
45 chacha_permute(__m128i *p0, __m128i *p1, __m128i *p2, __m128i *p3,
48 __m128i r0, r1, r2, r3;
49 __m128i c0, c1, c2, c3;
92 __m128i in0, in1, in2, in3;
93 __m128i r0, r1, r2, r3;
95 r0 = in0 = _mm_loadu_si128((const __m128i *)c);
96 r1 = in1 = _mm_loadu_si128((const __m128i *)k);
97 r2 = in2 = _mm_loadu_si128((const __m128i *)k + 1);
98 r3 = in3 = _mm_loadu_si128((const __m128i *)in);
102 _mm_storeu_si128((__m128i *)out + 0, _mm_add_epi32(r0, in0));
103 _mm_storeu_si128((__m128i *)out + 1, _mm_add_epi32(r1, in1));
104 _mm_storeu_si128((__m128i *)out + 2, _mm_add_epi32(r2, in2));
105 _mm_storeu_si128((__m128i *)out + 3, _mm_add_epi32(r3, in3));
115 __m128i r0, r1, r2, r3;
117 r0 = _mm_loadu_si128((const __m128i *)c);
118 r1 = _mm_loadu_si128((const __m128i *)k);
119 r2 = _mm_loadu_si128((const __m128i *)k + 1);
120 r3 = _mm_loadu_si128((const __m128i *)in);
124 _mm_storeu_si128((__m128i *)out + 0, r0);
125 _mm_storeu_si128((__m128i *)out + 1, r3);
137 static inline __m128i
140 return (__m128i)_mm_load1_ps(p);
143 static inline __m128i
150 storeu_epi32(void *p, __m128i v)
155 static inline __m128i
156 unpack0_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
162 return (__m128i)_mm_movelh_ps(lo, hi);
165 static inline __m128i
166 unpack1_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
172 return (__m128i)_mm_movehl_ps(hi, lo);
175 static inline __m128i
176 unpack2_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
182 return (__m128i)_mm_movelh_ps(lo, hi);
185 static inline __m128i
186 unpack3_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
192 return (__m128i)_mm_movehl_ps(hi, lo);
203 __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
204 __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
205 __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
313 const __m128i blkno_inc = _mm_set_epi32(0,0,0,1);
314 __m128i in0, in1, in2, in3;
315 __m128i r0, r1, r2, r3;
317 in0 = _mm_loadu_si128((const __m128i *)chacha_const32);
318 in1 = _mm_loadu_si128((const __m128i *)k);
319 in2 = _mm_loadu_si128((const __m128i *)k + 1);
337 _mm_storeu_si128((__m128i *)buf + 0, r0);
338 _mm_storeu_si128((__m128i *)buf + 1, r1);
339 _mm_storeu_si128((__m128i *)buf + 2, r2);
340 _mm_storeu_si128((__m128i *)buf + 3, r3);
346 _mm_storeu_si128((__m128i *)s + 0, r0);
347 _mm_storeu_si128((__m128i *)s + 1, r1);
348 _mm_storeu_si128((__m128i *)s + 2, r2);
349 _mm_storeu_si128((__m128i *)s + 3, r3);
363 __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
364 __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
365 __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
472 const __m128i blkno_inc = _mm_set_epi32(0,0,0,1);
473 __m128i in0, in1, in2, in3;
474 __m128i r0, r1, r2, r3;
476 in0 = _mm_loadu_si128((const __m128i *)chacha_const32);
477 in1 = _mm_loadu_si128((const __m128i *)k);
478 in2 = _mm_loadu_si128((const __m128i *)k + 1);
497 _mm_storeu_si128((__m128i *)buf + 0, r0);
498 _mm_storeu_si128((__m128i *)buf + 1, r1);
499 _mm_storeu_si128((__m128i *)buf + 2, r2);
500 _mm_storeu_si128((__m128i *)buf + 3, r3);
511 r0 ^= _mm_loadu_si128((const __m128i *)p + 0);
512 r1 ^= _mm_loadu_si128((const __m128i *)p + 1);
513 r2 ^= _mm_loadu_si128((const __m128i *)p + 2);
514 r3 ^= _mm_loadu_si128((const __m128i *)p + 3);
515 _mm_storeu_si128((__m128i *)s + 0, r0);
516 _mm_storeu_si128((__m128i *)s + 1, r1);
517 _mm_storeu_si128((__m128i *)s + 2, r2);
518 _mm_storeu_si128((__m128i *)s + 3, r3);