/* Copyright (C) 2002-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

/* This is the iWMMXt flavour of mmintrin.h; refuse inclusion unless the
   target actually has the WMMX/WMMX2 instructions available.  */
#ifndef __IWMMXT__
#error mmintrin.h included without enabling WMMX/WMMX2 instructions (e.g. -march=iwmmxt or -march=iwmmxt2)
#endif


#if defined __cplusplus
extern "C" {
/* Intrinsics use C name-mangling.  */
#endif /* __cplusplus */
/* The data type intended for user use.  */
typedef unsigned long long __m64, __int64;

/* Internal data types for implementing the intrinsics.  */
typedef int __v2si __attribute__ ((vector_size (8)));
typedef short __v4hi __attribute__ ((vector_size (8)));
typedef signed char __v8qi __attribute__ ((vector_size (8)));

/* Provided for source compatibility with x86 MMX code; there is no
   state to reset on iWMMXt, so this intrinsic does nothing.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
}

/* "Convert" between __m64 and __int64.  Both are the same 64-bit
   integer type here, so these are pure reinterpretations.  */
static __inline __m64
_mm_cvtsi64_m64 (__int64 __i)
{
  __m64 __r = __i;
  return __r;
}

static __inline __int64
_mm_cvtm64_si64 (__m64 __i)
{
  __int64 __r = __i;
  return __r;
}

/* Truncate a 64-bit value to its low 32 bits.  */
static __inline int
_mm_cvtsi64_si32 (__int64 __i)
{
  return (int) __i;
}

/* Widen a 32-bit value to 64 bits, zero-extending the low word.  */
static __inline __int64
_mm_cvtsi32_si64 (int __i)
{
  return (unsigned int) __i;
}
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
static __inline __m64
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackhss ((__v4hi)__m1, (__v4hi)__m2);
}

/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
static __inline __m64
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackwss ((__v2si)__m1, (__v2si)__m2);
}

/* Copy the 64-bit value from M1 into the lower 32-bits of the result, and
   the 64-bit value from M2 into the upper 32-bits of the result, all with
   signed saturation for values that do not fit exactly into 32-bits.  */
static __inline __m64
_mm_packs_pi64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackdss ((long long)__m1, (long long)__m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
static __inline __m64
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackhus ((__v4hi)__m1, (__v4hi)__m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with unsigned saturation.  */
static __inline __m64
_mm_packs_pu32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackwus ((__v2si)__m1, (__v2si)__m2);
}

/* Copy the 64-bit value from M1 into the lower 32-bits of the result, and
   the 64-bit value from M2 into the upper 32-bits of the result, all with
   unsigned saturation for values that do not fit exactly into 32-bits.  */
static __inline __m64
_mm_packs_pu64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackdus ((long long)__m1, (long long)__m2);
}
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
static __inline __m64
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckihb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
static __inline __m64
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckihh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
static __inline __m64
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckihw ((__v2si)__m1, (__v2si)__m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
static __inline __m64
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckilb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
static __inline __m64
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckilh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
static __inline __m64
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckilw ((__v2si)__m1, (__v2si)__m2);
}
/* Take the four 8-bit values from the low half of M1, sign extend them,
   and return the result as a vector of four 16-bit quantities.  */
static __inline __m64
_mm_unpackel_pi8 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckelsb ((__v8qi)__m1);
}

/* Take the two 16-bit values from the low half of M1, sign extend them,
   and return the result as a vector of two 32-bit quantities.  */
static __inline __m64
_mm_unpackel_pi16 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckelsh ((__v4hi)__m1);
}

/* Take the 32-bit value from the low half of M1, and return it sign extended
   to 64 bits.  */
static __inline __m64
_mm_unpackel_pi32 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckelsw ((__v2si)__m1);
}

/* Take the four 8-bit values from the high half of M1, sign extend them,
   and return the result as a vector of four 16-bit quantities.  */
static __inline __m64
_mm_unpackeh_pi8 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehsb ((__v8qi)__m1);
}

/* Take the two 16-bit values from the high half of M1, sign extend them,
   and return the result as a vector of two 32-bit quantities.  */
static __inline __m64
_mm_unpackeh_pi16 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehsh ((__v4hi)__m1);
}

/* Take the 32-bit value from the high half of M1, and return it sign extended
   to 64 bits.  */
static __inline __m64
_mm_unpackeh_pi32 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehsw ((__v2si)__m1);
}

/* Take the four 8-bit values from the low half of M1, zero extend them,
   and return the result as a vector of four 16-bit quantities.  */
static __inline __m64
_mm_unpackel_pu8 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckelub ((__v8qi)__m1);
}

/* Take the two 16-bit values from the low half of M1, zero extend them,
   and return the result as a vector of two 32-bit quantities.  */
static __inline __m64
_mm_unpackel_pu16 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckeluh ((__v4hi)__m1);
}

/* Take the 32-bit value from the low half of M1, and return it zero extended
   to 64 bits.  */
static __inline __m64
_mm_unpackel_pu32 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckeluw ((__v2si)__m1);
}

/* Take the four 8-bit values from the high half of M1, zero extend them,
   and return the result as a vector of four 16-bit quantities.  */
static __inline __m64
_mm_unpackeh_pu8 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehub ((__v8qi)__m1);
}

/* Take the two 16-bit values from the high half of M1, zero extend them,
   and return the result as a vector of two 32-bit quantities.  */
static __inline __m64
_mm_unpackeh_pu16 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehuh ((__v4hi)__m1);
}

/* Take the 32-bit value from the high half of M1, and return it zero extended
   to 64 bits.  */
static __inline __m64
_mm_unpackeh_pu32 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehuw ((__v2si)__m1);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 (wrapping).  */
static __inline __m64
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 (wrapping).  */
static __inline __m64
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2 (wrapping).  */
static __inline __m64
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddw ((__v2si)__m1, (__v2si)__m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddbss ((__v8qi)__m1, (__v8qi)__m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddhss ((__v4hi)__m1, (__v4hi)__m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2 using signed
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddwss ((__v2si)__m1, (__v2si)__m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddbus ((__v8qi)__m1, (__v8qi)__m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddhus ((__v4hi)__m1, (__v4hi)__m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2 using unsigned
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pu32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddwus ((__v2si)__m1, (__v2si)__m2);
}
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 (wrapping).  */
static __inline __m64
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 (wrapping).  */
static __inline __m64
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1 (wrapping).  */
static __inline __m64
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubw ((__v2si)__m1, (__v2si)__m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
static __inline __m64
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubbss ((__v8qi)__m1, (__v8qi)__m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
static __inline __m64
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubhss ((__v4hi)__m1, (__v4hi)__m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1 using
   signed saturating arithmetic.  */
static __inline __m64
_mm_subs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubwss ((__v2si)__m1, (__v2si)__m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
static __inline __m64
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubbus ((__v8qi)__m1, (__v8qi)__m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
static __inline __m64
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubhus ((__v4hi)__m1, (__v4hi)__m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1 using
   unsigned saturating arithmetic.  */
static __inline __m64
_mm_subs_pu32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubwus ((__v2si)__m1, (__v2si)__m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 producing four 32-bit intermediate results, which are then summed by
   pairs to produce two 32-bit results.  */
static __inline __m64
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmadds ((__v4hi)__m1, (__v4hi)__m2);
}

/* Multiply four unsigned 16-bit values in M1 by four unsigned 16-bit values
   in M2 producing four 32-bit intermediate results, which are then summed by
   pairs to produce two 32-bit results.  */
static __inline __m64
_mm_madd_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmaddu ((__v4hi)__m1, (__v4hi)__m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
static __inline __m64
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmulsm ((__v4hi)__m1, (__v4hi)__m2);
}

/* Multiply four unsigned 16-bit values in M1 by four unsigned 16-bit values
   in M2 and produce the high 16 bits of the 32-bit results.  */
static __inline __m64
_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmulum ((__v4hi)__m1, (__v4hi)__m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results (signedness is irrelevant for the low
   half of the product).  */
static __inline __m64
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmulul ((__v4hi)__m1, (__v4hi)__m2);
}
/* Shift four 16-bit values in M left by COUNT (count held in an __m64).  */
static __inline __m64
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsllh ((__v4hi)__m, __count);
}

/* As above, with the shift count supplied as an int.  */
static __inline __m64
_mm_slli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsllhi ((__v4hi)__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
static __inline __m64
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsllw ((__v2si)__m, __count);
}

/* As above, with the shift count supplied as an int.  */
static __inline __m64
_mm_slli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsllwi ((__v2si)__m, __count);
}

/* Shift the 64-bit value in M left by COUNT.  */
static __inline __m64
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wslld (__m, __count);
}

/* As above, with the shift count supplied as an int.  */
static __inline __m64
_mm_slli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wslldi (__m, __count);
}
/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
static __inline __m64
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrah ((__v4hi)__m, __count);
}

/* As above, with the shift count supplied as an int.  */
static __inline __m64
_mm_srai_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrahi ((__v4hi)__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
static __inline __m64
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsraw ((__v2si)__m, __count);
}

/* As above, with the shift count supplied as an int.  */
static __inline __m64
_mm_srai_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrawi ((__v2si)__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in the sign bit.  */
static __inline __m64
_mm_sra_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrad (__m, __count);
}

/* As above, with the shift count supplied as an int.  */
static __inline __m64
_mm_srai_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsradi (__m, __count);
}
/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
static __inline __m64
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrlh ((__v4hi)__m, __count);
}

/* As above, with the shift count supplied as an int.  */
static __inline __m64
_mm_srli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrlhi ((__v4hi)__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
static __inline __m64
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrlw ((__v2si)__m, __count);
}

/* As above, with the shift count supplied as an int.  */
static __inline __m64
_mm_srli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrlwi ((__v2si)__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
static __inline __m64
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrld (__m, __count);
}

/* As above, with the shift count supplied as an int.  */
static __inline __m64
_mm_srli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrldi (__m, __count);
}
/* Rotate four 16-bit values in M right by COUNT.  */
static __inline __m64
_mm_ror_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wrorh ((__v4hi)__m, __count);
}

/* As above, with the rotate count supplied as an int.  */
static __inline __m64
_mm_rori_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wrorhi ((__v4hi)__m, __count);
}

/* Rotate two 32-bit values in M right by COUNT.  */
static __inline __m64
_mm_ror_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wrorw ((__v2si)__m, __count);
}

/* As above, with the rotate count supplied as an int.  */
static __inline __m64
_mm_rori_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wrorwi ((__v2si)__m, __count);
}

/* Rotate the 64-bit value in M right by COUNT.  */
static __inline __m64
_mm_ror_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wrord (__m, __count);
}

/* As above, with the rotate count supplied as an int.  */
static __inline __m64
_mm_rori_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wrordi (__m, __count);
}
/* Bit-wise AND the 64-bit values in M1 and M2.  */
static __inline __m64
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_arm_wand (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2, i.e. (~M1) & M2.  Note the deliberately swapped
   argument order: the WANDN operation complements its second operand, so
   passing (__m2, __m1) yields the documented result.  */
static __inline __m64
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_arm_wandn (__m2, __m1);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
static __inline __m64
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_arm_wor (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
static __inline __m64
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_arm_wxor (__m1, __m2);
}
/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
static __inline __m64
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Signed greater-than comparison, per 8-bit element.  */
static __inline __m64
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpgtsb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Unsigned greater-than comparison, per 8-bit element.  */
static __inline __m64
_mm_cmpgt_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpgtub ((__v8qi)__m1, (__v8qi)__m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
static __inline __m64
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpeqh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Signed greater-than comparison, per 16-bit element.  */
static __inline __m64
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpgtsh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Unsigned greater-than comparison, per 16-bit element.  */
static __inline __m64
_mm_cmpgt_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpgtuh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
static __inline __m64
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpeqw ((__v2si)__m1, (__v2si)__m2);
}

/* Signed greater-than comparison, per 32-bit element.  */
static __inline __m64
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpgtsw ((__v2si)__m1, (__v2si)__m2);
}

/* Unsigned greater-than comparison, per 32-bit element.  */
static __inline __m64
_mm_cmpgt_pu32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpgtuw ((__v2si)__m1, (__v2si)__m2);
}
/* Element-wise multiplication of unsigned 16-bit values __B and __C, followed
   by accumulate across all elements and __A.  */
static __inline __m64
_mm_mac_pu16 (__m64 __A, __m64 __B, __m64 __C)
{
  return __builtin_arm_wmacu (__A, (__v4hi)__B, (__v4hi)__C);
}

/* Element-wise multiplication of signed 16-bit values __B and __C, followed
   by accumulate across all elements and __A.  */
static __inline __m64
_mm_mac_pi16 (__m64 __A, __m64 __B, __m64 __C)
{
  return __builtin_arm_wmacs (__A, (__v4hi)__B, (__v4hi)__C);
}

/* Element-wise multiplication of unsigned 16-bit values __A and __B, followed
   by accumulate across all elements (no extra addend).  */
static __inline __m64
_mm_macz_pu16 (__m64 __A, __m64 __B)
{
  return __builtin_arm_wmacuz ((__v4hi)__A, (__v4hi)__B);
}

/* Element-wise multiplication of signed 16-bit values __A and __B, followed
   by accumulate across all elements (no extra addend).  */
static __inline __m64
_mm_macz_pi16 (__m64 __A, __m64 __B)
{
  return __builtin_arm_wmacsz ((__v4hi)__A, (__v4hi)__B);
}
/* Accumulate across all unsigned 8-bit values in __A.  */
static __inline __m64
_mm_acc_pu8 (__m64 __A)
{
  return __builtin_arm_waccb ((__v8qi)__A);
}

/* Accumulate across all unsigned 16-bit values in __A.  */
static __inline __m64
_mm_acc_pu16 (__m64 __A)
{
  return __builtin_arm_wacch ((__v4hi)__A);
}

/* Accumulate across all unsigned 32-bit values in __A.  */
static __inline __m64
_mm_acc_pu32 (__m64 __A)
{
  return __builtin_arm_waccw ((__v2si)__A);
}
/* Multiply-accumulate intrinsics wrapping the TMIA instruction family.
   Each multiplies (parts of) the 32-bit values __B and __C and adds the
   product into the 64-bit accumulator __A.  Semantics are taken from the
   instruction names ("b"/"t" select the bottom/top 16-bit halves of the
   operands); confirm exact widths against the iWMMXt reference.  */

/* __A + (__B * __C)  (TMIA).  */
static __inline __m64
_mm_mia_si64 (__m64 __A, int __B, int __C)
{
  return __builtin_arm_tmia (__A, __B, __C);
}

/* Packed-halfword multiply-accumulate (TMIAPH): both 16-bit halves of
   __B are multiplied by the corresponding halves of __C and accumulated.  */
static __inline __m64
_mm_miaph_si64 (__m64 __A, int __B, int __C)
{
  return __builtin_arm_tmiaph (__A, __B, __C);
}

/* __A + (bottom half of __B * bottom half of __C)  (TMIABB).  */
static __inline __m64
_mm_miabb_si64 (__m64 __A, int __B, int __C)
{
  return __builtin_arm_tmiabb (__A, __B, __C);
}

/* __A + (bottom half of __B * top half of __C)  (TMIABT).  */
static __inline __m64
_mm_miabt_si64 (__m64 __A, int __B, int __C)
{
  return __builtin_arm_tmiabt (__A, __B, __C);
}

/* __A + (top half of __B * bottom half of __C)  (TMIATB).  */
static __inline __m64
_mm_miatb_si64 (__m64 __A, int __B, int __C)
{
  return __builtin_arm_tmiatb (__A, __B, __C);
}

/* __A + (top half of __B * top half of __C)  (TMIATT).  */
static __inline __m64
_mm_miatt_si64 (__m64 __A, int __B, int __C)
{
  return __builtin_arm_tmiatt (__A, __B, __C);
}
/* Extract element N of A and sign extend it.  The selector N must be an
   immediate (these are macros, not functions, for exactly that reason).  */
#define _mm_extract_pi8(A, N) __builtin_arm_textrmsb ((__v8qi)(A), (N))
#define _mm_extract_pi16(A, N) __builtin_arm_textrmsh ((__v4hi)(A), (N))
#define _mm_extract_pi32(A, N) __builtin_arm_textrmsw ((__v2si)(A), (N))

/* Extract element N of A and zero extend it.  The selector N must be
   an immediate.  */
#define _mm_extract_pu8(A, N) __builtin_arm_textrmub ((__v8qi)(A), (N))
#define _mm_extract_pu16(A, N) __builtin_arm_textrmuh ((__v4hi)(A), (N))
#define _mm_extract_pu32(A, N) __builtin_arm_textrmuw ((__v2si)(A), (N))

/* Insert word D into element N of A.  The selector N must be an
   immediate.  */
#define _mm_insert_pi8(A, D, N) \
  ((__m64) __builtin_arm_tinsrb ((__v8qi)(A), (D), (N)))
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_arm_tinsrh ((__v4hi)(A), (D), (N)))
#define _mm_insert_pi32(A, D, N) \
  ((__m64) __builtin_arm_tinsrw ((__v2si)(A), (D), (N)))
/* Compute the element-wise maximum of signed 8-bit values (WMAXSB).  */
static __inline __m64
_mm_max_pi8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wmaxsb ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the element-wise maximum of signed 16-bit values (WMAXSH).  */
static __inline __m64
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wmaxsh ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the element-wise maximum of signed 32-bit values (WMAXSW).  */
static __inline __m64
_mm_max_pi32 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wmaxsw ((__v2si)__A, (__v2si)__B);
}

/* Compute the element-wise maximum of unsigned 8-bit values (WMAXUB).  */
static __inline __m64
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wmaxub ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the element-wise maximum of unsigned 16-bit values (WMAXUH).  */
static __inline __m64
_mm_max_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wmaxuh ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the element-wise maximum of unsigned 32-bit values (WMAXUW).  */
static __inline __m64
_mm_max_pu32 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wmaxuw ((__v2si)__A, (__v2si)__B);
}
850 1.1 mrg
/* Compute the element-wise minimum of signed 8-bit values (WMINSB).
   (The original comment said 16-bit; this operates on 8-bit elements.)  */
static __inline __m64
_mm_min_pi8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wminsb ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the element-wise minimum of signed 16-bit values (WMINSH).  */
static __inline __m64
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wminsh ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the element-wise minimum of signed 32-bit values (WMINSW).  */
static __inline __m64
_mm_min_pi32 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wminsw ((__v2si)__A, (__v2si)__B);
}

/* Compute the element-wise minimum of unsigned 8-bit values (WMINUB).
   (The original comment said 16-bit; this operates on 8-bit elements.)  */
static __inline __m64
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wminub ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the element-wise minimum of unsigned 16-bit values (WMINUH).  */
static __inline __m64
_mm_min_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wminuh ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the element-wise minimum of unsigned 32-bit values (WMINUW).  */
static __inline __m64
_mm_min_pu32 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wminuw ((__v2si)__A, (__v2si)__B);
}
892 1.1 mrg
/* Create an 8-bit mask of the signs of 8-bit values (TMOVMSKB).  */
static __inline int
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_arm_tmovmskb ((__v8qi)__A);
}

/* Create an 8-bit mask of the signs of 16-bit values (TMOVMSKH).  */
static __inline int
_mm_movemask_pi16 (__m64 __A)
{
  return __builtin_arm_tmovmskh ((__v4hi)__A);
}

/* Create an 8-bit mask of the signs of 32-bit values (TMOVMSKW).  */
static __inline int
_mm_movemask_pi32 (__m64 __A)
{
  return __builtin_arm_tmovmskw ((__v2si)__A);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate (WSHUFH; presumably two selector bits per
   destination halfword, as with _MM_SHUFFLE -- verify against the
   iWMMXt ISA reference).  */
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_arm_wshufh ((__v4hi)(A), (N)))
918 1.1 mrg
919 1.1 mrg
/* Compute the rounded averages of the unsigned 8-bit values in A and B
   (WAVG2BR).  */
static __inline __m64
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wavg2br ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B
   (WAVG2HR).  */
static __inline __m64
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wavg2hr ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the (truncating) averages of the unsigned 8-bit values in A
   and B (WAVG2B).  */
static __inline __m64
_mm_avg2_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wavg2b ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the (truncating) averages of the unsigned 16-bit values in A
   and B (WAVG2H).  */
static __inline __m64
_mm_avg2_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wavg2h ((__v4hi)__A, (__v4hi)__B);
}
947 1.1 mrg
/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared (WSADBZ: the accumulator is zeroed first).  */
static __inline __m64
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
}

/* As above, but add the sum of absolute differences of __B and __C
   into the running accumulator __A (WSADB).  */
static __inline __m64
_mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C)
{
  return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C);
}

/* Compute the sum of the absolute differences of the unsigned 16-bit
   values in A and B.  Return the value in the lower 32-bit word; the
   upper words are cleared (WSADHZ).  */
static __inline __m64
_mm_sad_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
}

/* As above, but accumulate into __A (WSADH).  */
static __inline __m64
_mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C)
{
  return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C);
}

977 1.1.1.2 mrg
978 1.1.1.2 mrg
/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B, zeroing the accumulator first.  Return the value
   in the lower 16-bit word; the upper words are cleared.  Identical to
   _mm_sad_pu8 (both map to WSADBZ).  */
static __inline __m64
_mm_sadz_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the sum of the absolute differences of the unsigned 16-bit
   values in A and B, zeroing the accumulator first.  Return the value
   in the lower 32-bit word; the upper words are cleared.  Identical to
   _mm_sad_pu16 (both map to WSADHZ).  */
static __inline __m64
_mm_sadz_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
}

/* Extract a 64-bit value spanning __A:__B starting at byte N of __A.
   N must be an immediate (WALIGNI).  */
#define _mm_align_si64(__A,__B, N) \
  (__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N))
999 1.1 mrg
/* Creates a 64-bit zero (WZERO).  */
static __inline __m64
_mm_setzero_si64 (void)
{
  return __builtin_arm_wzero ();
}
1006 1.1 mrg
/* Set and Get arbitrary iWMMXt Control registers.
   Note only registers 0-3 and 8-11 are currently defined,
   the rest are reserved.  */

/* Write __value into control register __regno: 0-3 map to wCID, wCon,
   wCSSF and wCASF via TMCR; 8-11 map to the general registers
   wCGR0-wCGR3 via builtins.  Any other __regno is silently ignored.
   __regno must be a compile-time-selectable constant for the intended
   single-instruction expansion, but any int is accepted.  */
static __inline void
_mm_setwcx (const int __value, const int __regno)
{
  switch (__regno)
    {
    case 0:
      __asm __volatile ("tmcr wcid, %0" :: "r"(__value));
      break;
    case 1:
      __asm __volatile ("tmcr wcon, %0" :: "r"(__value));
      break;
    case 2:
      __asm __volatile ("tmcr wcssf, %0" :: "r"(__value));
      break;
    case 3:
      __asm __volatile ("tmcr wcasf, %0" :: "r"(__value));
      break;
    case 8:
      __builtin_arm_setwcgr0 (__value);
      break;
    case 9:
      __builtin_arm_setwcgr1 (__value);
      break;
    case 10:
      __builtin_arm_setwcgr2 (__value);
      break;
    case 11:
      __builtin_arm_setwcgr3 (__value);
      break;
    default:
      break;
    }
}
1044 1.1 mrg
/* Read iWMMXt control register __regno: 0-3 map to wCID, wCon, wCSSF
   and wCASF via TMRC; 8-11 map to wCGR0-wCGR3 via builtins.  Returns 0
   for any other (reserved) register number.  */
static __inline int
_mm_getwcx (const int __regno)
{
  /* Initialized so an out-of-range __regno returns a defined value;
     previously the default path returned an uninitialized variable,
     which is undefined behavior.  */
  int __value = 0;
  switch (__regno)
    {
    case 0:
      __asm __volatile ("tmrc %0, wcid" : "=r"(__value));
      break;
    case 1:
      __asm __volatile ("tmrc %0, wcon" : "=r"(__value));
      break;
    case 2:
      __asm __volatile ("tmrc %0, wcssf" : "=r"(__value));
      break;
    case 3:
      __asm __volatile ("tmrc %0, wcasf" : "=r"(__value));
      break;
    case 8:
      return __builtin_arm_getwcgr0 ();
    case 9:
      return __builtin_arm_getwcgr1 ();
    case 10:
      return __builtin_arm_getwcgr2 ();
    case 11:
      return __builtin_arm_getwcgr3 ();
    default:
      break;
    }
  return __value;
}
1076 1.1 mrg
/* Creates a vector of two 32-bit values; I0 is least significant.
   NOTE(review): relies on the struct placing __i0 in the low-order half
   of the 64-bit union member, i.e. a little-endian layout -- confirm if
   a big-endian iWMMXt configuration is ever targeted.  */
static __inline __m64
_mm_set_pi32 (int __i1, int __i0)
{
  union
  {
    __m64 __q;
    struct
    {
      unsigned int __i0;
      unsigned int __i1;
    } __s;
  } __u;

  __u.__s.__i0 = __i0;
  __u.__s.__i1 = __i1;

  return __u.__q;
}
1096 1.1 mrg
1097 1.1 mrg /* Creates a vector of four 16-bit values; W0 is least significant. */
1098 1.1 mrg static __inline __m64
1099 1.1 mrg _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
1100 1.1 mrg {
1101 1.1.1.2 mrg unsigned int __i1 = (unsigned short) __w3 << 16 | (unsigned short) __w2;
1102 1.1.1.2 mrg unsigned int __i0 = (unsigned short) __w1 << 16 | (unsigned short) __w0;
1103 1.1.1.2 mrg
1104 1.1 mrg return _mm_set_pi32 (__i1, __i0);
1105 1.1 mrg }
1106 1.1 mrg
1107 1.1 mrg /* Creates a vector of eight 8-bit values; B0 is least significant. */
1108 1.1 mrg static __inline __m64
1109 1.1 mrg _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
1110 1.1 mrg char __b3, char __b2, char __b1, char __b0)
1111 1.1 mrg {
1112 1.1 mrg unsigned int __i1, __i0;
1113 1.1 mrg
1114 1.1 mrg __i1 = (unsigned char)__b7;
1115 1.1 mrg __i1 = __i1 << 8 | (unsigned char)__b6;
1116 1.1 mrg __i1 = __i1 << 8 | (unsigned char)__b5;
1117 1.1 mrg __i1 = __i1 << 8 | (unsigned char)__b4;
1118 1.1 mrg
1119 1.1 mrg __i0 = (unsigned char)__b3;
1120 1.1 mrg __i0 = __i0 << 8 | (unsigned char)__b2;
1121 1.1 mrg __i0 = __i0 << 8 | (unsigned char)__b1;
1122 1.1 mrg __i0 = __i0 << 8 | (unsigned char)__b0;
1123 1.1 mrg
1124 1.1 mrg return _mm_set_pi32 (__i1, __i0);
1125 1.1 mrg }
1126 1.1 mrg
/* Similar, but with the arguments in reverse order
   (least-significant element first).  */
static __inline __m64
_mm_setr_pi32 (int __i0, int __i1)
{
  return _mm_set_pi32 (__i1, __i0);
}

static __inline __m64
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

static __inline __m64
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
1146 1.1 mrg
/* Creates a vector of two 32-bit values, both elements containing I.  */
static __inline __m64
_mm_set1_pi32 (int __i)
{
  return _mm_set_pi32 (__i, __i);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
static __inline __m64
_mm_set1_pi16 (short __w)
{
  unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
  return _mm_set1_pi32 (__i);
}

/* Creates a vector of eight 8-bit values, all elements containing B.
   (The original comment said "four 16-bit values".)  */
static __inline __m64
_mm_set1_pi8 (char __b)
{
  unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
  unsigned int __i = __w << 16 | __w;
  return _mm_set1_pi32 (__i);
}
1170 1.1 mrg
1171 1.1.1.2 mrg #ifdef __IWMMXT2__
1172 1.1.1.2 mrg static __inline __m64
1173 1.1.1.2 mrg _mm_abs_pi8 (__m64 m1)
1174 1.1.1.2 mrg {
1175 1.1.1.2 mrg return (__m64) __builtin_arm_wabsb ((__v8qi)m1);
1176 1.1.1.2 mrg }
1177 1.1.1.2 mrg
1178 1.1.1.2 mrg static __inline __m64
1179 1.1.1.2 mrg _mm_abs_pi16 (__m64 m1)
1180 1.1.1.2 mrg {
1181 1.1.1.2 mrg return (__m64) __builtin_arm_wabsh ((__v4hi)m1);
1182 1.1.1.2 mrg
1183 1.1.1.2 mrg }
1184 1.1.1.2 mrg
1185 1.1.1.2 mrg static __inline __m64
1186 1.1.1.2 mrg _mm_abs_pi32 (__m64 m1)
1187 1.1.1.2 mrg {
1188 1.1.1.2 mrg return (__m64) __builtin_arm_wabsw ((__v2si)m1);
1189 1.1.1.2 mrg
1190 1.1.1.2 mrg }
1191 1.1.1.2 mrg
1192 1.1.1.2 mrg static __inline __m64
1193 1.1.1.2 mrg _mm_addsubhx_pi16 (__m64 a, __m64 b)
1194 1.1.1.2 mrg {
1195 1.1.1.2 mrg return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b);
1196 1.1.1.2 mrg }
1197 1.1.1.2 mrg
1198 1.1.1.2 mrg static __inline __m64
1199 1.1.1.2 mrg _mm_absdiff_pu8 (__m64 a, __m64 b)
1200 1.1.1.2 mrg {
1201 1.1.1.2 mrg return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b);
1202 1.1.1.2 mrg }
1203 1.1.1.2 mrg
1204 1.1.1.2 mrg static __inline __m64
1205 1.1.1.2 mrg _mm_absdiff_pu16 (__m64 a, __m64 b)
1206 1.1.1.2 mrg {
1207 1.1.1.2 mrg return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b);
1208 1.1.1.2 mrg }
1209 1.1.1.2 mrg
1210 1.1.1.2 mrg static __inline __m64
1211 1.1.1.2 mrg _mm_absdiff_pu32 (__m64 a, __m64 b)
1212 1.1.1.2 mrg {
1213 1.1.1.2 mrg return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b);
1214 1.1.1.2 mrg }
1215 1.1.1.2 mrg
1216 1.1.1.2 mrg static __inline __m64
1217 1.1.1.2 mrg _mm_addc_pu16 (__m64 a, __m64 b)
1218 1.1.1.2 mrg {
1219 1.1.1.2 mrg __m64 result;
1220 1.1.1.2 mrg __asm__ __volatile__ ("waddhc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b));
1221 1.1.1.2 mrg return result;
1222 1.1.1.2 mrg }
1223 1.1.1.2 mrg
1224 1.1 mrg static __inline __m64
1225 1.1.1.2 mrg _mm_addc_pu32 (__m64 a, __m64 b)
1226 1.1 mrg {
1227 1.1.1.2 mrg __m64 result;
1228 1.1.1.2 mrg __asm__ __volatile__ ("waddwc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b));
1229 1.1.1.2 mrg return result;
1230 1.1 mrg }
1231 1.1 mrg
1232 1.1.1.2 mrg static __inline __m64
1233 1.1.1.2 mrg _mm_avg4_pu8 (__m64 a, __m64 b)
1234 1.1.1.2 mrg {
1235 1.1.1.2 mrg return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b);
1236 1.1.1.2 mrg }
1237 1.1.1.2 mrg
1238 1.1.1.2 mrg static __inline __m64
1239 1.1.1.2 mrg _mm_avg4r_pu8 (__m64 a, __m64 b)
1240 1.1.1.2 mrg {
1241 1.1.1.2 mrg return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b);
1242 1.1.1.2 mrg }
1243 1.1.1.2 mrg
1244 1.1.1.2 mrg static __inline __m64
1245 1.1.1.2 mrg _mm_maddx_pi16 (__m64 a, __m64 b)
1246 1.1.1.2 mrg {
1247 1.1.1.2 mrg return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b);
1248 1.1.1.2 mrg }
1249 1.1.1.2 mrg
1250 1.1.1.2 mrg static __inline __m64
1251 1.1.1.2 mrg _mm_maddx_pu16 (__m64 a, __m64 b)
1252 1.1.1.2 mrg {
1253 1.1.1.2 mrg return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b);
1254 1.1.1.2 mrg }
1255 1.1.1.2 mrg
1256 1.1.1.2 mrg static __inline __m64
1257 1.1.1.2 mrg _mm_msub_pi16 (__m64 a, __m64 b)
1258 1.1.1.2 mrg {
1259 1.1.1.2 mrg return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b);
1260 1.1.1.2 mrg }
1261 1.1.1.2 mrg
1262 1.1.1.2 mrg static __inline __m64
1263 1.1.1.2 mrg _mm_msub_pu16 (__m64 a, __m64 b)
1264 1.1.1.2 mrg {
1265 1.1.1.2 mrg return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b);
1266 1.1.1.2 mrg }
1267 1.1.1.2 mrg
1268 1.1.1.2 mrg static __inline __m64
1269 1.1.1.2 mrg _mm_mulhi_pi32 (__m64 a, __m64 b)
1270 1.1.1.2 mrg {
1271 1.1.1.2 mrg return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b);
1272 1.1.1.2 mrg }
1273 1.1.1.2 mrg
1274 1.1.1.2 mrg static __inline __m64
1275 1.1.1.2 mrg _mm_mulhi_pu32 (__m64 a, __m64 b)
1276 1.1.1.2 mrg {
1277 1.1.1.2 mrg return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b);
1278 1.1.1.2 mrg }
1279 1.1.1.2 mrg
1280 1.1.1.2 mrg static __inline __m64
1281 1.1.1.2 mrg _mm_mulhir_pi16 (__m64 a, __m64 b)
1282 1.1.1.2 mrg {
1283 1.1.1.2 mrg return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b);
1284 1.1.1.2 mrg }
1285 1.1.1.2 mrg
1286 1.1.1.2 mrg static __inline __m64
1287 1.1.1.2 mrg _mm_mulhir_pi32 (__m64 a, __m64 b)
1288 1.1.1.2 mrg {
1289 1.1.1.2 mrg return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b);
1290 1.1.1.2 mrg }
1291 1.1.1.2 mrg
1292 1.1.1.2 mrg static __inline __m64
1293 1.1.1.2 mrg _mm_mulhir_pu16 (__m64 a, __m64 b)
1294 1.1.1.2 mrg {
1295 1.1.1.2 mrg return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b);
1296 1.1.1.2 mrg }
1297 1.1.1.2 mrg
1298 1.1.1.2 mrg static __inline __m64
1299 1.1.1.2 mrg _mm_mulhir_pu32 (__m64 a, __m64 b)
1300 1.1.1.2 mrg {
1301 1.1.1.2 mrg return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b);
1302 1.1.1.2 mrg }
1303 1.1.1.2 mrg
1304 1.1.1.2 mrg static __inline __m64
1305 1.1.1.2 mrg _mm_mullo_pi32 (__m64 a, __m64 b)
1306 1.1.1.2 mrg {
1307 1.1.1.2 mrg return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b);
1308 1.1.1.2 mrg }
1309 1.1.1.2 mrg
1310 1.1.1.2 mrg static __inline __m64
1311 1.1.1.2 mrg _mm_qmulm_pi16 (__m64 a, __m64 b)
1312 1.1.1.2 mrg {
1313 1.1.1.2 mrg return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b);
1314 1.1.1.2 mrg }
1315 1.1.1.2 mrg
1316 1.1.1.2 mrg static __inline __m64
1317 1.1.1.2 mrg _mm_qmulm_pi32 (__m64 a, __m64 b)
1318 1.1.1.2 mrg {
1319 1.1.1.2 mrg return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b);
1320 1.1.1.2 mrg }
1321 1.1.1.2 mrg
1322 1.1.1.2 mrg static __inline __m64
1323 1.1.1.2 mrg _mm_qmulmr_pi16 (__m64 a, __m64 b)
1324 1.1.1.2 mrg {
1325 1.1.1.2 mrg return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b);
1326 1.1.1.2 mrg }
1327 1.1.1.2 mrg
1328 1.1.1.2 mrg static __inline __m64
1329 1.1.1.2 mrg _mm_qmulmr_pi32 (__m64 a, __m64 b)
1330 1.1.1.2 mrg {
1331 1.1.1.2 mrg return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b);
1332 1.1.1.2 mrg }
1333 1.1.1.2 mrg
1334 1.1.1.2 mrg static __inline __m64
1335 1.1.1.2 mrg _mm_subaddhx_pi16 (__m64 a, __m64 b)
1336 1.1.1.2 mrg {
1337 1.1.1.2 mrg return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b);
1338 1.1.1.2 mrg }
1339 1.1.1.2 mrg
1340 1.1.1.2 mrg static __inline __m64
1341 1.1.1.2 mrg _mm_addbhusl_pu8 (__m64 a, __m64 b)
1342 1.1.1.2 mrg {
1343 1.1.1.2 mrg return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b);
1344 1.1.1.2 mrg }
1345 1.1.1.2 mrg
1346 1.1.1.2 mrg static __inline __m64
1347 1.1.1.2 mrg _mm_addbhusm_pu8 (__m64 a, __m64 b)
1348 1.1.1.2 mrg {
1349 1.1.1.2 mrg return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b);
1350 1.1.1.2 mrg }
1351 1.1.1.2 mrg
/* Saturating quad multiply-accumulate (WQMIAxx): multiply the selected
   16-bit halves of m1 and m2 ('b' = bottom, 't' = top, per 32-bit lane)
   and accumulate the products into the 32-bit fields of acc; a trailing
   'n' negates the products before accumulation.  All arguments must be
   __m64 values; these are macros so the accumulator update expands to a
   single instruction pattern.  */
#define _mm_qmiabb_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiabbn_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiabt_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiabtn_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiatb_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiatbn_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiatt_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiattn_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })
1423 1.1.1.2 mrg
/* 16x16 multiply, accumulate into the full 64-bit accumulator (WMIAxx):
   'b'/'t' select bottom/top halfwords of m1 and m2; a trailing 'n'
   negates the products before accumulation.  */
#define _mm_wmiabb_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiabbn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiabt_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiabt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiabtn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiatb_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiatbn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiatt_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiattn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })
1495 1.1.1.2 mrg
/* 32x32 multiply, accumulate into the 64-bit accumulator (WMIAWxx):
   'b'/'t' select the bottom/top 32-bit words of m1 and m2; a trailing
   'n' negates the product before accumulation.  */
#define _mm_wmiawbb_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawbbn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawbt_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawbtn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawtb_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawtbn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawtt_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawttn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = acc;\
   __m64 _m1 = m1;\
   __m64 _m2 = m2;\
   _acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })
1567 1.1.1.2 mrg
1568 1.1.1.2 mrg /* The third arguments should be an immediate. */
1569 1.1.1.2 mrg #define _mm_merge_si64(a, b, n) \
1570 1.1.1.2 mrg ({\
1571 1.1.1.2 mrg __m64 result;\
1572 1.1.1.2 mrg result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\
1573 1.1.1.2 mrg result;\
1574 1.1.1.2 mrg })
1575 1.1.1.2 mrg #endif /* __IWMMXT2__ */
1576 1.1.1.2 mrg
1577 1.1.1.2 mrg static __inline __m64
1578 1.1.1.2 mrg _mm_alignr0_si64 (__m64 a, __m64 b)
1579 1.1.1.2 mrg {
1580 1.1.1.2 mrg return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b);
1581 1.1.1.2 mrg }
1582 1.1.1.2 mrg
1583 1.1.1.2 mrg static __inline __m64
1584 1.1.1.2 mrg _mm_alignr1_si64 (__m64 a, __m64 b)
1585 1.1.1.2 mrg {
1586 1.1.1.2 mrg return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b);
1587 1.1.1.2 mrg }
1588 1.1.1.2 mrg
1589 1.1.1.2 mrg static __inline __m64
1590 1.1.1.2 mrg _mm_alignr2_si64 (__m64 a, __m64 b)
1591 1.1.1.2 mrg {
1592 1.1.1.2 mrg return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b);
1593 1.1.1.2 mrg }
1594 1.1.1.2 mrg
1595 1.1.1.2 mrg static __inline __m64
1596 1.1.1.2 mrg _mm_alignr3_si64 (__m64 a, __m64 b)
1597 1.1.1.2 mrg {
1598 1.1.1.2 mrg return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b);
1599 1.1.1.2 mrg }
1600 1.1.1.2 mrg
/* Flag-setting coprocessor ops.  The r15 destination directs the
   result into the CPSR condition flags.  Empty parameter lists changed
   to (void) so these definitions provide real prototypes (an empty ()
   leaves the parameter list unspecified in C).  */

/* AND the SIMD condition flags across all 8-bit fields (TANDCB).  */
static __inline void
_mm_tandcb (void)
{
  __asm __volatile ("tandcb r15");
}

/* AND the SIMD condition flags across all 16-bit fields (TANDCH).  */
static __inline void
_mm_tandch (void)
{
  __asm __volatile ("tandch r15");
}

/* AND the SIMD condition flags across all 32-bit fields (TANDCW).  */
static __inline void
_mm_tandcw (void)
{
  __asm __volatile ("tandcw r15");
}

/* Extract the SIMD condition flags for field n into the CPSR
   (TEXTRC*); n must be an immediate, hence macros.  */
#define _mm_textrcb(n) \
  ({\
   __asm__ __volatile__ (\
     "textrcb r15, %0" : : "i" (n));\
   })

#define _mm_textrch(n) \
  ({\
   __asm__ __volatile__ (\
     "textrch r15, %0" : : "i" (n));\
   })

#define _mm_textrcw(n) \
  ({\
   __asm__ __volatile__ (\
     "textrcw r15, %0" : : "i" (n));\
   })

/* OR the SIMD condition flags across all 8-bit fields (TORCB).  */
static __inline void
_mm_torcb (void)
{
  __asm __volatile ("torcb r15");
}

/* OR the SIMD condition flags across all 16-bit fields (TORCH).  */
static __inline void
_mm_torch (void)
{
  __asm __volatile ("torch r15");
}

/* OR the SIMD condition flags across all 32-bit fields (TORCW).  */
static __inline void
_mm_torcw (void)
{
  __asm __volatile ("torcw r15");
}
1654 1.1.1.2 mrg
#ifdef __IWMMXT2__
/* OR the saturation flags across fields into the CPSR (TORVSC*, WMMX2
   only).  Empty parameter lists changed to (void) to provide real
   prototypes.  */
static __inline void
_mm_torvscb (void)
{
  __asm __volatile ("torvscb r15");
}

static __inline void
_mm_torvsch (void)
{
  __asm __volatile ("torvsch r15");
}

static __inline void
_mm_torvscw (void)
{
  __asm __volatile ("torvscw r15");
}
#endif /* __IWMMXT2__ */
1674 1.1.1.2 mrg
1675 1.1.1.2 mrg static __inline __m64
1676 1.1.1.2 mrg _mm_tbcst_pi8 (int value)
1677 1.1.1.2 mrg {
1678 1.1.1.2 mrg return (__m64) __builtin_arm_tbcstb ((signed char) value);
1679 1.1.1.2 mrg }
1680 1.1.1.2 mrg
1681 1.1.1.2 mrg static __inline __m64
1682 1.1.1.2 mrg _mm_tbcst_pi16 (int value)
1683 1.1.1.2 mrg {
1684 1.1.1.2 mrg return (__m64) __builtin_arm_tbcsth ((short) value);
1685 1.1.1.2 mrg }
1686 1.1.1.2 mrg
1687 1.1.1.2 mrg static __inline __m64
1688 1.1.1.2 mrg _mm_tbcst_pi32 (int value)
1689 1.1.1.2 mrg {
1690 1.1.1.2 mrg return (__m64) __builtin_arm_tbcstw (value);
1691 1.1.1.2 mrg }
1692 1.1.1.2 mrg
/* MMX-compatibility aliases: map the historical Intel `_m_*' spellings
   onto the `_mm_*' intrinsics defined above.  */
#define _m_empty _mm_empty
#define _m_packsswb _mm_packs_pi16
#define _m_packssdw _mm_packs_pi32
#define _m_packuswb _mm_packs_pu16
#define _m_packusdw _mm_packs_pu32
#define _m_packssqd _mm_packs_pi64
#define _m_packusqd _mm_packs_pu64
/* `_mm_*'-style synonyms for the 64-bit pack operations.  */
#define _mm_packs_si64 _mm_packs_pi64
#define _mm_packs_su64 _mm_packs_pu64
/* MMX-compatibility aliases for the unpack (interleave) intrinsics.  */
#define _m_punpckhbw _mm_unpackhi_pi8
#define _m_punpckhwd _mm_unpackhi_pi16
#define _m_punpckhdq _mm_unpackhi_pi32
#define _m_punpcklbw _mm_unpacklo_pi8
#define _m_punpcklwd _mm_unpacklo_pi16
#define _m_punpckldq _mm_unpacklo_pi32
/* iWMMXt-specific widening unpacks: "eh"/"el" = extend high/low half,
   signed (s) or unsigned (u) variants.  */
#define _m_punpckehsbw _mm_unpackeh_pi8
#define _m_punpckehswd _mm_unpackeh_pi16
#define _m_punpckehsdq _mm_unpackeh_pi32
#define _m_punpckehubw _mm_unpackeh_pu8
#define _m_punpckehuwd _mm_unpackeh_pu16
#define _m_punpckehudq _mm_unpackeh_pu32
#define _m_punpckelsbw _mm_unpackel_pi8
#define _m_punpckelswd _mm_unpackel_pi16
#define _m_punpckelsdq _mm_unpackel_pi32
#define _m_punpckelubw _mm_unpackel_pu8
#define _m_punpckeluwd _mm_unpackel_pu16
#define _m_punpckeludq _mm_unpackel_pu32
/* MMX-compatibility aliases for the add/subtract intrinsics
   (wrapping, signed-saturating, and unsigned-saturating forms).  */
#define _m_paddb _mm_add_pi8
#define _m_paddw _mm_add_pi16
#define _m_paddd _mm_add_pi32
#define _m_paddsb _mm_adds_pi8
#define _m_paddsw _mm_adds_pi16
#define _m_paddsd _mm_adds_pi32
#define _m_paddusb _mm_adds_pu8
#define _m_paddusw _mm_adds_pu16
#define _m_paddusd _mm_adds_pu32
#define _m_psubb _mm_sub_pi8
#define _m_psubw _mm_sub_pi16
#define _m_psubd _mm_sub_pi32
#define _m_psubsb _mm_subs_pi8
#define _m_psubsw _mm_subs_pi16
/* The signed-saturating 32-bit subtract alias should be `_m_psubsd'
   to match the _m_paddsb/_m_paddsw/_m_paddsd pattern, but it has
   historically been spelled `_m_psubuw'.  Provide the correctly named
   alias and keep the historical one for backward compatibility.  */
#define _m_psubsd _mm_subs_pi32
#define _m_psubuw _mm_subs_pi32
#define _m_psubusb _mm_subs_pu8
#define _m_psubusw _mm_subs_pu16
#define _m_psubusd _mm_subs_pu32
/* MMX-compatibility aliases for multiply, multiply-accumulate,
   accumulate, and the iWMMXt internal-accumulator (pmia*) ops.  */
#define _m_pmaddwd _mm_madd_pi16
#define _m_pmadduwd _mm_madd_pu16
#define _m_pmulhw _mm_mulhi_pi16
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pmullw _mm_mullo_pi16
#define _m_pmacsw _mm_mac_pi16
#define _m_pmacuw _mm_mac_pu16
#define _m_pmacszw _mm_macz_pi16
#define _m_pmacuzw _mm_macz_pu16
#define _m_paccb _mm_acc_pu8
#define _m_paccw _mm_acc_pu16
#define _m_paccd _mm_acc_pu32
#define _m_pmia _mm_mia_si64
#define _m_pmiaph _mm_miaph_si64
#define _m_pmiabb _mm_miabb_si64
#define _m_pmiabt _mm_miabt_si64
#define _m_pmiatb _mm_miatb_si64
#define _m_pmiatt _mm_miatt_si64
/* MMX-compatibility aliases for the shift and rotate intrinsics;
   the `...i' forms take an immediate count, the others a __m64.  */
#define _m_psllw _mm_sll_pi16
#define _m_psllwi _mm_slli_pi16
#define _m_pslld _mm_sll_pi32
#define _m_pslldi _mm_slli_pi32
#define _m_psllq _mm_sll_si64
#define _m_psllqi _mm_slli_si64
#define _m_psraw _mm_sra_pi16
#define _m_psrawi _mm_srai_pi16
#define _m_psrad _mm_sra_pi32
#define _m_psradi _mm_srai_pi32
#define _m_psraq _mm_sra_si64
#define _m_psraqi _mm_srai_si64
#define _m_psrlw _mm_srl_pi16
#define _m_psrlwi _mm_srli_pi16
#define _m_psrld _mm_srl_pi32
#define _m_psrldi _mm_srli_pi32
#define _m_psrlq _mm_srl_si64
#define _m_psrlqi _mm_srli_si64
#define _m_prorw _mm_ror_pi16
#define _m_prorwi _mm_rori_pi16
#define _m_prord _mm_ror_pi32
#define _m_prordi _mm_rori_pi32
#define _m_prorq _mm_ror_si64
#define _m_prorqi _mm_rori_si64
/* MMX-compatibility aliases for the bitwise-logical and compare
   intrinsics (signed and iWMMXt unsigned compare variants).  */
#define _m_pand _mm_and_si64
#define _m_pandn _mm_andnot_si64
#define _m_por _mm_or_si64
#define _m_pxor _mm_xor_si64
#define _m_pcmpeqb _mm_cmpeq_pi8
#define _m_pcmpeqw _mm_cmpeq_pi16
#define _m_pcmpeqd _mm_cmpeq_pi32
#define _m_pcmpgtb _mm_cmpgt_pi8
#define _m_pcmpgtub _mm_cmpgt_pu8
#define _m_pcmpgtw _mm_cmpgt_pi16
#define _m_pcmpgtuw _mm_cmpgt_pu16
#define _m_pcmpgtd _mm_cmpgt_pi32
#define _m_pcmpgtud _mm_cmpgt_pu32
/* MMX-compatibility aliases for element extract/insert, min/max,
   and movemask intrinsics.  */
#define _m_pextrb _mm_extract_pi8
#define _m_pextrw _mm_extract_pi16
#define _m_pextrd _mm_extract_pi32
#define _m_pextrub _mm_extract_pu8
#define _m_pextruw _mm_extract_pu16
#define _m_pextrud _mm_extract_pu32
#define _m_pinsrb _mm_insert_pi8
#define _m_pinsrw _mm_insert_pi16
#define _m_pinsrd _mm_insert_pi32
#define _m_pmaxsb _mm_max_pi8
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxsd _mm_max_pi32
#define _m_pmaxub _mm_max_pu8
#define _m_pmaxuw _mm_max_pu16
#define _m_pmaxud _mm_max_pu32
#define _m_pminsb _mm_min_pi8
#define _m_pminsw _mm_min_pi16
#define _m_pminsd _mm_min_pi32
#define _m_pminub _mm_min_pu8
#define _m_pminuw _mm_min_pu16
#define _m_pminud _mm_min_pu32
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmovmskw _mm_movemask_pi16
#define _m_pmovmskd _mm_movemask_pi32
/* MMX-compatibility aliases for shuffle, average, sum-of-absolute-
   differences, align, and 64-bit/32-bit conversion intrinsics.  */
#define _m_pshufw _mm_shuffle_pi16
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_pavg2b _mm_avg2_pu8
#define _m_pavg2w _mm_avg2_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_psadwd _mm_sad_pu16
#define _m_psadzbw _mm_sadz_pu8
#define _m_psadzwd _mm_sadz_pu16
#define _m_paligniq _mm_align_si64
#define _m_cvt_si2pi _mm_cvtsi64_m64
#define _m_cvt_pi2si _mm_cvtm64_si64
#define _m_from_int _mm_cvtsi32_si64
#define _m_to_int _mm_cvtsi64_si32
1831 1.1.1.2 mrg
1832 1.1.1.2 mrg #if defined __cplusplus
1833 1.1.1.2 mrg }; /* End "C" */
1834 1.1.1.2 mrg #endif /* __cplusplus */
1835 1.1 mrg
1836 1.1 mrg #endif /* _MMINTRIN_H_INCLUDED */
1837