/* Copyright (C) 2002-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
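
/* Illustrative note (not part of the original header): a port that has
   reviewed its use of these intrinsics would typically silence the
   warning above on the command line, e.g.

       gcc -O2 -mcpu=power8 -DNO_WARN_X86_INTRINSICS foo.c

   where foo.c is a hypothetical file that includes this header.  */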

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8))) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
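
/* Illustrative sketch (not part of the original API): __m64_union gives
   lane-wise access to an __m64 held in a scalar register, e.g.

       __m64_union u;
       u.as_m64 = some_m64;             // some_m64 is a hypothetical value
       short low_lane = u.as_short[0];  // least significant 16-bit lane

   As used throughout this header, index 0 is the least-significant
   element of the 64-bit value.  */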

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64 bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}
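
/* Illustrative example (not part of the original header): the 32-bit
   conversions above compose as expected,

       int x = -7;
       __m64 v = _mm_cvtsi32_si64 (x);   // low 32 bits hold 0xFFFFFFF9
       int y = _mm_cvtsi64_si32 (v);     // y == -7 again

   since _mm_cvtsi32_si64 zero-extends and _mm_cvtsi64_si32 truncates.  */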

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long) __i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}
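
/* A worked example of the signed-saturating pack above (illustrative,
   not part of the original header):

       __m64 a = _mm_set_pi16 (300, -300, 5, -5);
       __m64 r = _mm_packs_pi16 (a, _mm_setzero_si64 ());
       // low four bytes of r:  -5, 5, -128 (-300 clamped), 127 (300 clamped)
       // high four bytes of r: 0, 0, 0, 0 (from the second operand)
*/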

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char r;
  __vector signed short vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (vm1, __zero);
  r = vec_packs ((__vector unsigned short) vm1, (__vector unsigned short) vm1);
  __vector __bool char packsel = vec_pack (__select, __select);
  r = vec_sel (r, (const __vector unsigned char) __zero, packsel);
  return (__m64) ((__vector long long) r)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
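
/* A worked example of the unsigned-saturating pack above (illustrative,
   not part of the original header):

       __m64 a = _mm_set_pi16 (300, -3, 255, 7);
       __m64 r = _mm_packs_pu16 (a, _mm_setzero_si64 ());
       // low four bytes of r: 7, 255, 0 (-3 clamped), 255 (300 clamped)

   Negative inputs clamp to 0 and values above 255 clamp to 255, which is
   what the vec_cmplt/vec_sel sequence implements.  */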
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}
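
/* Illustrative note (not part of the original header): these element-wise
   adds wrap modulo 2^8, matching the x86 paddb semantics, e.g.

       __m64 a = _mm_set1_pi8 (100);
       __m64 r = _mm_add_pi8 (a, a);   // each byte is 200 - 256 = -56
*/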

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
        "r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}
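
/* Illustrative note (not part of the original header): on POWER6 and later
   in 64-bit mode, the cmpb path above compares all eight bytes in a single
   instruction, e.g.

       __m64 a = _mm_set_pi8 (1, 2, 3, 4, 5, 6, 7, 8);
       __m64 b = _mm_set_pi8 (1, 0, 3, 0, 5, 0, 7, 0);
       __m64 r = _mm_cmpeq_pi8 (a, b);
       // r has 0xFF in the bytes holding 1, 3, 5, 7 and 0x00 elsewhere
*/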

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = (__vector signed char)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_vmsumshm (a, b, zero);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
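
/* A worked example of the multiply-add above (illustrative, not part of
   the original header), writing lanes least-significant first:

       __m64 a = _mm_set_pi16 (4, 3, 2, 1);   // lanes {1, 2, 3, 4}
       __m64 b = _mm_set_pi16 (8, 7, 6, 5);   // lanes {5, 6, 7, 8}
       __m64 r = _mm_madd_pi16 (a, b);
       // low  32-bit lane: 1*5 + 2*6 = 17
       // high 32-bit lane: 3*7 + 4*8 = 53
*/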

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
    };

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);

  w0 = vec_vmulesh (a, b);
  w1 = vec_vmulosh (a, b);
  c = (__vector signed short)vec_perm (w0, w1, xform1);

  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}
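
/* A worked example of the high-half multiply above (illustrative, not
   part of the original header):

       __m64 a = _mm_set1_pi16 (0x4000);      // 16384
       __m64 b = _mm_set1_pi16 (8);
       __m64 r = _mm_mulhi_pi16 (a, b);
       // each lane: (16384 * 8) >> 16 = 0x20000 >> 16 = 2
*/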

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = a * b;
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sl (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}
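
/* Illustrative note (not part of the original header): as with the x86
   psllw instruction, shift counts greater than 15 clear the result, e.g.

       __m64 r1 = _mm_sll_pi16 (_mm_set1_pi16 (1), 3);    // each lane is 8
       __m64 r2 = _mm_sll_pi16 (_mm_set1_pi16 (1), 16);   // all zeros
*/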

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sra (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector unsigned short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sr (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
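
/* Illustrative note (not part of the original header) on argument order:

       _mm_set_pi8  (7, 6, 5, 4, 3, 2, 1, 0)   // byte 0 (LSB) is 0
       _mm_setr_pi8 (7, 6, 5, 4, 3, 2, 1, 0)   // byte 0 (LSB) is 7

   _mm_set_* lists elements most-significant first; _mm_setr_* reverses
   that, listing the least-significant element first.  */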

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */