/* Copyright (C) 2002-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
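
/* As a quick orientation (a sketch, using only intrinsics defined
   below): typical x86 MMX code such as

     __m64 a = _mm_set_pi16 (1, 2, 3, 4);
     __m64 b = _mm_set_pi16 (10, 20, 30, 40);
     __m64 c = _mm_add_pi16 (a, b);

   yields lanes (11, 22, 33, 44) and compiles unchanged with this
   header when built with -DNO_WARN_X86_INTRINSICS; on power8 and later
   the _pi16/_pi8 operations move the 64-bit value into a vector
   register, operate there, and move the result back.  */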

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8))) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char[8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
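
/* For example (a sketch of how the scalar fallback paths below use
   this union; not itself part of the Intel API):

     __m64_union u;
     u.as_m64 = m;
     short w0 = u.as_short[0];

   Here w0 is the least significant 16-bit lane; reading a different
   member than was last written is exactly the aliasing the comment
   above allows for.  */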

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

#ifdef __powerpc64__
/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}
/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long) __i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short) __builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkshss (vm1, vm1);
  return (__m64) __builtin_unpack_vector_int128 ((__vector __int128) vresult, 0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}
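
/* For example (values only, a sketch): under signed saturation each
   16-bit element is clamped to the signed 8-bit range [-128, 127]
   before being packed, so a lane holding 32000 becomes 127 and a lane
   holding -32000 becomes -128, while in-range values such as 5 pass
   through unchanged.  */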

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int) __builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkswss (vm1, vm1);
  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128) vresult, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector unsigned char vresult;

  vm1 = (__vector signed short) __builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkshus (vm1, vm1);
  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128) vresult, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char) vec_splats (__m1);
  b = (__vector unsigned char) vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}
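
/* For example (values only, a sketch): with M1 bytes (0, 1, 2, 3, 4,
   5, 6, 7) and M2 bytes (10, 11, 12, 13, 14, 15, 16, 17) in element
   order, the high halves interleave to give result bytes
   (4, 14, 5, 15, 6, 16, 7, 17).  */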

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char) vec_splats (__m1);
  b = (__vector unsigned char) vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 1));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char) vec_splats (__m1);
  b = (__vector signed char) vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}
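
/* For example (values only, a sketch): lane arithmetic here wraps
   modulo 2^8 rather than saturating, so adding 100 and 100 in an
   8-bit lane yields -56 (0xC8 read as a signed char); use
   _mm_adds_pi8 below when saturation is wanted.  */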

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short) vec_splats (__m1);
  b = (__vector signed short) vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int) vec_splats (__m1);
  b = (__vector signed int) vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char) vec_splats (__m1);
  b = (__vector signed char) vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short) vec_splats (__m1);
  b = (__vector signed short) vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int) vec_splats (__m1);
  b = (__vector signed int) vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}
/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}
/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#ifdef _ARCH_PWR6
  __m64 res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
        "r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}
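
/* A note on the _ARCH_PWR6 path above (a sketch of the instruction's
   semantics): the POWER cmpb instruction compares its two source
   registers byte by byte, setting each result byte to 0xFF where the
   bytes are equal and 0x00 where they differ, so a single instruction
   covers all eight 8-bit lanes.  */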

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char) vec_splats (__m1);
  b = (__vector signed char) vec_splats (__m2);
  c = (__vector signed char) vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short) vec_splats (__m1);
  b = (__vector signed short) vec_splats (__m2);
  c = (__vector signed short) vec_cmpeq (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short) vec_splats (__m1);
  b = (__vector signed short) vec_splats (__m2);
  c = (__vector signed short) vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int) vec_splats (__m1);
  b = (__vector signed int) vec_splats (__m2);
  c = (__vector signed int) vec_cmpeq (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int) vec_splats (__m1);
  b = (__vector signed int) vec_splats (__m2);
  c = (__vector signed int) vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}
#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char) vec_splats (__m1);
  b = (__vector signed char) vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short) vec_splats (__m1);
  b = (__vector signed short) vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char) vec_splats (__m1);
  b = (__vector unsigned char) vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short) vec_splats (__m1);
  b = (__vector unsigned short) vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char) vec_splats (__m1);
  b = (__vector signed char) vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short) vec_splats (__m1);
  b = (__vector signed short) vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char) vec_splats (__m1);
  b = (__vector unsigned char) vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short) vec_splats (__m1);
  b = (__vector unsigned short) vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short) vec_splats (__m1);
  b = (__vector signed short) vec_splats (__m2);
  c = vec_vmsumshm (a, b, zero);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
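
/* For example (values only, a sketch): madd of lanes (1, 2, 3, 4)
   with (10, 20, 30, 40) forms the 32-bit products (10, 40, 90, 160)
   and sums them in pairs, giving the two 32-bit results (50, 250).  */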

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
    };

  a = (__vector signed short) vec_splats (__m1);
  b = (__vector signed short) vec_splats (__m2);

  w0 = vec_vmulesh (a, b);
  w1 = vec_vmulosh (a, b);
  c = (__vector signed short) vec_perm (w0, w1, xform1);

  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}
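
/* For example (values only, a sketch): a lane holding 16384 (0x4000)
   multiplied by a lane holding 512 (0x0200) gives the full 32-bit
   product 0x00800000, so the high 16 bits returned for that lane are
   0x0080 (128).  */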

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short) vec_splats (__m1);
  b = (__vector signed short) vec_splats (__m2);
  c = a * b;
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short) vec_splats (__m);
      c = (__vector unsigned short) vec_splats ((unsigned short) __count);
      r = vec_sl (m, (__vector unsigned short) c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t) r, 0));
    }
  else
    return (0);
}
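
/* Note (a sketch of the boundary case): a count greater than 15
   shifts every 16-bit lane out entirely, which is why the code above
   returns 0 instead of entering the vector path.  */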

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short) vec_splats (__m);
      c = (__vector unsigned short) vec_splats ((unsigned short) __count);
      r = vec_sra (m, (__vector unsigned short) c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t) r, 0));
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector unsigned short) vec_splats (__m);
      c = (__vector unsigned short) vec_splats ((unsigned short) __count);
      r = vec_sr (m, (__vector unsigned short) c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t) r, 0));
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int) m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int) m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
             char __b3, char __b2, char __b1, char __b0)
{
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}
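
/* For example (a sketch): _mm_set_pi8 (7, 6, 5, 4, 3, 2, 1, 0) places
   0 in the least significant byte and 7 in the most significant byte;
   _mm_setr_pi8 below takes the same bytes in the reverse (element)
   order.  */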

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
              char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short) vec_splats (__w);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) w, 0));
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char) vec_splats (__b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t) b, 0));
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
#endif /* __powerpc64__ */
#endif /* _MMINTRIN_H_INCLUDED */