/* Copyright (C) 2002-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8),
			__may_alias__)) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
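
/* For illustration only (a hypothetical usage sketch, not an API): the
   scalar fallback paths below view an __m64 value through __m64_union to
   read and write individual lanes, e.g.:

     __m64_union __u;
     __u.as_m64 = __some_m64;           // __some_m64 is a hypothetical value
     short __lane0 = __u.as_short[0];   // lane 0 (least-significant lane)
     __u.as_short[0] = (short) (__lane0 + 1);
*/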

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64 bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}
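
/* For illustration only (values follow directly from the casts above):
   the 32-bit conversions operate on the integer value, e.g.:

     __m64 __v = _mm_cvtsi32_si64 (-1);    // 0x00000000ffffffffULL
     int __lo  = _mm_cvtsi64_si32 (__v);   // -1 (the low 32 bits)
*/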

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (__vm1, __zero);
  __r = vec_packs ((__vector unsigned short) __vm1,
		   (__vector unsigned short) __vm1);
  __vector __bool char __packsel = vec_pack (__select, __select);
  __r = vec_sel (__r, (const __vector unsigned char) __zero, __packsel);
  return (__m64) ((__vector long long) __r)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */
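
/* For illustration only (hypothetical values; requires _ARCH_PWR8):
   signed saturation when packing 16-bit lanes down to bytes, e.g.:

     __m64 __r = _mm_packs_pi16 (_mm_set_pi16 (300, -300, 5, -5),
                                 _mm_setzero_si64 ());
     // 300 saturates to 127, -300 saturates to -128, so
     // __r == _mm_set_pi8 (0, 0, 0, 0, 127, -128, 5, -5)
*/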

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}
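
/* For illustration only (hypothetical values): interleaving the low halves
   of two byte vectors, e.g.:

     __m64 __a  = _mm_setr_pi8 (0, 1, 2, 3, 4, 5, 6, 7);
     __m64 __b  = _mm_setr_pi8 (10, 11, 12, 13, 14, 15, 16, 17);
     __m64 __lo = _mm_unpacklo_pi8 (__a, __b);
     // __lo == _mm_setr_pi8 (0, 10, 1, 11, 2, 12, 3, 13)
     // _mm_unpackhi_pi8 (__a, __b) == _mm_setr_pi8 (4, 14, 5, 15, 6, 16, 7, 17)
*/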

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (__res)
      : "r" (__m1),
	"r" (__m2)
      : );
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0])? -1: 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1])? -1: 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2])? -1: 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3])? -1: 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4])? -1: 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5])? -1: 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6])? -1: 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = (__vector signed char)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0])? -1: 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1])? -1: 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2])? -1: 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3])? -1: 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4])? -1: 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5])? -1: 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6])? -1: 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0])? -1: 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1])? -1: 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2])? -1: 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0])? -1: 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1])? -1: 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2])? -1: 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0])? -1: 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0])? -1: 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}
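
/* For illustration only (a common idiom, not part of this API): because
   the comparisons above yield all-ones or all-zeros lanes, they combine
   with the logical operations for branchless lane selection, e.g. a
   per-lane maximum of hypothetical operands __a and __b:

     __m64 __mask = _mm_cmpgt_pi16 (__a, __b);   // -1 where a > b, else 0
     __m64 __max  = _mm_or_si64 (_mm_and_si64 (__mask, __a),
                                 _mm_andnot_si64 (__mask, __b));
*/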

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_vmsumshm (__a, __b, __zero);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
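
/* For illustration only (hypothetical values; requires _ARCH_PWR8):
   multiply-and-add of 16-bit lanes into pairwise 32-bit sums, e.g.:

     __m64 __r = _mm_madd_pi16 (_mm_setr_pi16 (1, 2, 3, 4),
                                _mm_setr_pi16 (10, 20, 30, 40));
     // __r == _mm_setr_pi32 (1*10 + 2*20, 3*30 + 4*40)
     //      == _mm_setr_pi32 (50, 250)
*/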
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
    };

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);

  __w0 = vec_vmulesh (__a, __b);
  __w1 = vec_vmulosh (__a, __b);
  __c = (__vector signed short)vec_perm (__w0, __w1, __xform1);

  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}
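
/* For illustration only (hypothetical values; requires _ARCH_PWR8): each
   result lane is the high half of the signed 32-bit product, e.g.
   0x4000 * 0x4000 == 0x10000000, whose high 16 bits are 0x1000:

     __m64 __r = _mm_mulhi_pi16 (_mm_set1_pi16 (0x4000),
                                 _mm_set1_pi16 (0x4000));
     // __r == _mm_set1_pi16 (0x1000)
*/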

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = __a * __b;
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sl (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}
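
/* For illustration only (hypothetical values; requires _ARCH_PWR8): each
   16-bit lane is shifted independently, and counts above 15 yield zero as
   implemented above:

     __m64 __r = _mm_slli_pi16 (_mm_set1_pi16 (0x0101), 4);
     // __r == _mm_set1_pi16 (0x1010)
     __m64 __z = _mm_slli_pi16 (__r, 16);
     // __z == 0 (count > 15)
*/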

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sra (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector unsigned short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sr (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
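
/* For illustration only (values follow from the comments above): _mm_set_*
   take arguments most-significant first, while _mm_setr_* take them in
   reverse (least-significant first) order, so for example:

     _mm_set_pi32 (1, 2) == _mm_setr_pi32 (2, 1);   // 2 is element 0
*/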

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union __res;

  __res.as_int[0] = __i;
  __res.as_int[1] = __i;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short __v;

  __v = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) __v)[0];
#else
  __m64_union __res;

  __res.as_short[0] = __w;
  __res.as_short[1] = __w;
  __res.as_short[2] = __w;
  __res.as_short[3] = __w;
  return (__res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char __res;

  __res = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) __res)[0];
#else
  __m64_union __res;

  __res.as_char[0] = __b;
  __res.as_char[1] = __b;
  __res.as_char[2] = __b;
  __res.as_char[3] = __b;
  __res.as_char[4] = __b;
  __res.as_char[5] = __b;
  __res.as_char[6] = __b;
  __res.as_char[7] = __b;
  return (__res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */