/* Copyright (C) 2003-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif
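
/* For example, an element-wise 32-bit add written with the Intel
   intrinsics can often be expressed directly in GNU C vector
   extensions, which GCC lowers to the native vector instructions of
   whatever target it compiles for.  An illustrative sketch (the type
   and function names below are hypothetical, not part of this
   header):

     typedef int v4si __attribute__ ((vector_size (16)));

     static inline v4si
     add_v4si (v4si a, v4si b)
     {
       return a + b;
     }
*/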

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

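/* Absolute value (x86 PABSB/PABSW/PABSD): each element of the result
   is the absolute value of the corresponding signed element of __A.
   The __m64 variants splat the operand into a 128-bit vector, operate
   on it, and return the low 64 bits.  */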
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

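/* Byte alignment (x86 PALIGNR): concatenate __A:__B, with __A in the
   most-significant position, shift the double-width value right by
   __count bytes, and return the low half.  */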
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
        {
          const __v16qu __zero = { 0 };
          return (__m128i) __zero;
        }
      else
        {
          const __v16qu __shift =
            vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
          return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
          return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
        }
    }
  else
    {
      const __v16qu __shiftA =
        vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

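/* Horizontal add (x86 PHADDW/PHADDD): each result element is the sum
   of a pair of adjacent elements, with the pair sums of __A in the
   low half of the result and the pair sums of __B in the high half.  */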
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

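/* Horizontal add of adjacent 16-bit pairs with signed saturation
   (x86 PHADDSW).  */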
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

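/* Horizontal subtract (x86 PHSUBW/PHSUBD): each result element is an
   even-indexed element minus the odd-indexed element that follows it.  */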
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

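/* Horizontal subtract of adjacent 16-bit pairs with signed saturation
   (x86 PHSUBSW).  */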
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

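/* Byte shuffle (x86 PSHUFB): result byte i is the byte of __A selected
   by the low bits of byte i of __B, or zero when byte i of __B is
   negative.  */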
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

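/* Conditional sign (x86 PSIGNB/PSIGNW/PSIGND): negate each element of
   __A where the corresponding element of __B is negative, zero it
   where __B is zero, and keep it where __B is positive.  */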
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

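/* Multiply and horizontally add (x86 PMADDUBSW): multiply each
   unsigned byte of __A by the corresponding signed byte of __B, then
   add the adjacent pairs of 16-bit products with signed saturation.  */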
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

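/* Rounded scaled multiply (x86 PMULHRSW): each result element is
   (((__A[i] * __B[i]) >> 14) + 1) >> 1, i.e. the signed 16-bit
   product scaled down by 2**15 with rounding.  */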
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#endif /* TMMINTRIN_H_ */